full_lengther_next 0.0.8 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -1,78 +1,116 @@
1
1
 
2
2
  module CommonFunctions
3
3
 
4
- def contenidos_en_prot(hit, full_prot, q)
5
-
6
- is_ok = false
7
- q_index_start = 9999
8
- fr_index_start = 0
9
- min_index_start = 9999
10
- aas_parecidos = 0
11
- masked_x = 0
12
- suma_fragments = 0
13
-
14
- masked_x = hit.q_seq.count('X')
15
- masked_x = masked_x + hit.q_seq.count('-')
16
-
17
- full_prot = full_prot.gsub(/[\-Xx]+/,'')
18
- compare_prot = hit.q_seq.gsub(/[\-Xx]+/,'-')
19
- fragments_array = compare_prot.split(/\-+/)
20
-
21
- fragments_array.each do |seq|
22
- # puts "seq: #{seq}\nfull_prot: #{full_prot}"
23
- simliar_fragment = full_prot.lcs(seq)
24
- suma_fragments += simliar_fragment.length
25
-
26
- fr_index_start = full_prot.index(simliar_fragment)
27
-
28
- if (q_index_start == 9999)
29
- q_index_start = fr_index_start
4
+ def contenidos_en_prot(key_seq, full_prot)
5
+ full_prot = full_prot.gsub(/[\-Xx]/,'-')
6
+ compare_prot = key_seq.gsub(/[\-Xx]/,'-')
7
+ q_index_start = full_prot.index(compare_prot) #Full match between hit.q_seq and full_prot (unigene)
8
+ if q_index_start.nil? #There is gaps that unables the full match
9
+ q_index_start = match_with_ungapped_reference(full_prot, compare_prot)
10
+ if q_index_start.nil? && full_prot.include?('-')
11
+ diff = full_prot.length - compare_prot.length
12
+ if scan_sequences(full_prot.split(''), compare_prot.split('')) == compare_prot.length
13
+ q_index_start = 0
14
+ end
15
+
16
+ if diff >0 && scan_sequences(full_prot.split(''), compare_prot.split(''), diff) == compare_prot.length
17
+ q_index_start = diff
18
+ end
19
+
20
+ if q_index_start.nil?
21
+ q_index_start = match_with_gapped_reference(full_prot, compare_prot)
22
+ end
23
+ end
24
+ if q_index_start.nil?
25
+ q_index_start = 0
30
26
  end
31
- full_prot = full_prot[(fr_index_start + simliar_fragment.length)..full_prot.length]
32
- end
33
-
34
- simliar_fragment = full_prot.lcs(compare_prot)
35
-
36
- # if ($verbose)
37
- # puts "#{q.query_def}-------------------------------------#{suma_fragments} de #{compare_prot.length}"
38
- # puts "#{q.query_def}-------------------------------------#{suma_fragments + masked_x} >= #{compare_prot.length * 0.7}"
39
- # puts "\nfull: #{full_prot}\ncomp: #{compare_prot}\nsimliar_fragment: #{simliar_fragment}"
40
- # end
41
-
42
- if (suma_fragments + masked_x >= compare_prot.length * 0.7)
43
- is_ok = true
44
- # puts "OK -- encontramos suficiente similitud entre query y subject -- OK"
45
- else
46
- is_ok = false
47
- # puts "\nfull: #{full_prot}\ncomp: #{compare_prot}"
48
- # puts "Warning!: no match comparing proteins"
49
- end
50
27
 
51
- min_index_start = [min_index_start, q_index_start].min
52
-
53
- if (min_index_start == 9999)
54
- min_index_start = 0
55
28
  end
56
-
57
- return [is_ok, min_index_start]
29
+ return q_index_start
58
30
  end
59
31
 
60
32
 
33
+ def match_with_gapped_reference(full_prot, compare_prot)
34
+ q_index_start = nil
35
+ fragments_array = full_prot.split(/\-+/)
36
+ fragments_array.each_with_index do |seq, i|
37
+ if seq.length > 4
38
+ compare_prot_index = compare_prot.index(seq)
39
+ if compare_prot_index.nil? # In cases that no match by gaps
40
+ seq =seq[0..4]
41
+ compare_prot_index = compare_prot.index(seq)
42
+ end
43
+ if !compare_prot_index.nil?
44
+ q_index_start = full_prot.index(seq)
45
+ if i > 0
46
+ q_index_start, compare_prot_index = extend_match(full_prot, compare_prot, q_index_start, compare_prot_index)
47
+ end
48
+ break
49
+ end
50
+ end
51
+ end
52
+ return q_index_start
53
+ end
61
54
 
55
+ def extend_match(full_prot, compare_prot, q_index_start, compare_prot_index)
56
+ full_prot_substring = full_prot[0..q_index_start-1].reverse.split('')
57
+ compare_prot_substring = compare_prot[0..compare_prot_index-1].reverse.split('')
58
+ extend_match = scan_sequences(full_prot_substring, compare_prot_substring)
59
+ q_index_start -= extend_match
60
+ compare_prot_index -= extend_match
61
+ return q_index_start, compare_prot_index
62
+ end
62
63
 
63
- def reverse_seq(query_fasta, h_qframe, h_qstart, h_qend)
64
-
65
- q_frame = -h_qframe.to_i
66
-
67
- q_beg = query_fasta.length - h_qend - 1
68
- q_end = query_fasta.length - h_qstart - 1
64
+ def scan_sequences(ref_seq, compare_seq, diff = 0)
65
+ extend_match = 0
66
+ ref_seq.each_with_index do |char,i|
67
+ if i >= diff
68
+ compare_char = compare_seq[extend_match]
69
+ if compare_char.nil? || char != compare_char && char != '-' && compare_char != '-'
70
+ break
71
+ end
72
+ extend_match += 1
73
+ end
74
+ end
75
+ return extend_match
76
+ end
69
77
 
70
- query_fasta = query_fasta.complementary_dna
78
+ def match_with_ungapped_reference(full_prot, compare_prot)
79
+ q_index_start = nil
80
+ fragments_array = compare_prot.split(/\-+/)
81
+ fragments_array.each_with_index do |seq, i|
82
+ if q_index_start.nil? && seq.length > 4
83
+ q_index_start = full_prot.index(seq)
84
+ if i > 0 && !q_index_start.nil?
85
+ q_index_start = refine_match(seq, compare_prot, q_index_start) # Correction if first seq isn't enough large
86
+ end
87
+ break
88
+ end
89
+ end
90
+ return q_index_start
91
+ end
71
92
 
72
- # el qend y el qstart estan al reves porque cuando la seq tiene frame negativo el blast los pone al reves
73
- return [query_fasta, q_frame, q_beg, q_end]
93
+ def refine_match(subseq, seq, q_index_start)
94
+ location_seq = seq.index(subseq)
95
+ gaps_on_location = seq[0..location_seq].count('-')
96
+ q_index_start -= location_seq - gaps_on_location # Correction if first seq isn't enough large
97
+ return q_index_start
74
98
  end
75
99
 
100
+ def reverse_seq(query_fasta, hit)
101
+ hit.q_frame = -hit.q_frame
102
+ hit.q_end = query_fasta.length - 1 - hit.q_end
103
+ hit.q_beg = query_fasta.length - 1 - hit.q_beg
104
+ hit.reversed = TRUE
105
+ query_fasta = query_fasta.complementary_dna # ESTO REALMENTE HACE LA REVERSO COMPLEMENTARIA.
106
+ if hit.class.to_s == 'ExoBlastHit'
107
+ hit.q_frameshift.map!{|position, num_nts|
108
+ reversed_position = query_fasta.length - 1 - position
109
+ [reversed_position, num_nts]
110
+ }
111
+ end
112
+ return query_fasta
113
+ end
76
114
 
77
115
 
78
116
  def corrige_frame(ref_frame,ref_start,ref_end)
@@ -89,6 +127,10 @@ module CommonFunctions
89
127
 
90
128
  end
91
129
 
92
-
93
-
94
- end
130
+ def check_frame_shift(hit)
131
+ fs = 0
132
+ prot_length_in_nts = hit.q_end-hit.q_beg+1
133
+ fs = prot_length_in_nts%3
134
+ return fs
135
+ end
136
+ end
@@ -0,0 +1,258 @@
1
+ # Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining
4
+ # a copy of this software and associated documentation files (the
5
+ # 'Software'), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to
8
+ # permit persons to whom the Software is furnished to do so, subject to
9
+ # the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
22
+ require 'blast_query'
23
+ require 'blast_hit'
24
+ require 'fl_string_utils.rb'
25
+
26
+ OPERATION = 0
27
+ QUERY = 1
28
+ TARGET = 2
29
+
30
+ class ExoBlastHit < BlastHit
31
+ attr_accessor :q_frameshift, :s_frameshift
32
+ def initialize(start_target, ends_target, start_query, ends_query)
33
+ super(start_target, ends_target, start_query, ends_query)
34
+ @s_frameshift=[]
35
+ @q_frameshift=[]
36
+ end
37
+ end
38
+
39
+ # Extracts results from blast table's file and uses it to create instances of "BlastQuery" and "BlastHit"
40
+ class ExonerateResult
41
+
42
+ # Parser initialization
43
+ def initialize(input, seqs= nil, query_seqs = nil, all = TRUE)
44
+ @querys = []
45
+ @seqs = seqs #unigenes
46
+ @prot_seqs = query_seqs#prot
47
+
48
+ if input.is_a?(Array)
49
+ input.each do |file|
50
+ parse_file(File.open(file).readlines, all)
51
+ end
52
+ else
53
+ parse_file(File.open(input).readlines, all)
54
+ end
55
+ query_name=''
56
+ end
57
+
58
+ def parse_file(lines, all)
59
+ lines_parsed=[]
60
+ lines_parsed={} if !all
61
+ lines.each do |line|
62
+ if line=~ /^vulgar:/
63
+ line.chomp!
64
+ fields=line.split(' ', 11)
65
+ features={'query_id'=> fields[1], 'query_start_align'=> fields[2].to_i, 'query_end_align'=> fields[3].to_i, 'query_strand'=> fields[4],'target_id'=> fields[5], 'target_start_align'=> fields[6].to_i, 'target_end_align'=> fields[7].to_i, 'target_strand'=> fields[8], 'score'=> fields[9].to_i, 'align_data'=> fields[10]}
66
+ if all
67
+ lines_parsed << features
68
+ else
69
+ if !lines_parsed.key?(features['target_id']) # Añadir valor si no existe
70
+ lines_parsed[features['target_id']]=features
71
+ else
72
+ if features['score']>lines_parsed[features['target_id']]['score'] # Si ya existe una query, ver si la nueva presenta un mayor score y reemplazar la antigua
73
+ lines_parsed[features['target_id']]=features
74
+ end
75
+ end
76
+ end
77
+ end
78
+ end
79
+ convert_parsed_lines(lines_parsed)
80
+ end
81
+
82
+ def convert_parsed_lines(lines_parsed)
83
+ last_query = nil
84
+ query = nil
85
+ lines_parsed.each_with_index do |line|
86
+ begin
87
+ if lines_parsed.class.to_s=='Array'
88
+ align_data=line['align_data']
89
+ features=line
90
+ else #hash
91
+ align_data=line[1]['align_data']
92
+ features=line[1]
93
+ end
94
+ tags = align_data.scan(/([MFG53S]) ([0-9]+) ([0-9]+)/)
95
+ tags.map!{|tag| [tag[0], tag[1].to_i, tag[2].to_i]}
96
+ if features['target_id'] != last_query
97
+ last_query = features['target_id']
98
+ query = BlastQuery.new(features['target_id'])
99
+ @querys << query
100
+ end
101
+ hiting(features,tags, query)
102
+ rescue
103
+ puts "Result: #{features['target_id']} => #{features['query_id']} hasn't been parsed\n#{line}"
104
+ end
105
+ end
106
+ end
107
+
108
+ #this method only works fine with --model protein2dna parameter of exonerate
109
+ def hiting(features, tags, query) #Convierte las coordenadas relativas del exonerate a absolutas tipo blast, definiendo solo los hits
110
+ do_align = FALSE
111
+ do_align = TRUE if !@prot_seqs.nil? && !@seqs.nil?
112
+ start_target = features['target_start_align']#Unigen
113
+ start_query = features['query_start_align'] #proteina
114
+ ends_target = features['target_end_align']
115
+ ends_query = features['query_end_align']-1 # -> Exonerate don't set to 0 position the ends of target and query
116
+ if features['target_strand'] == '-' #-> Exonerate don't set to 0 position the ends of target and query
117
+ start_target -= 1 # Start target is end target when mathc is in reversed complementary strand
118
+ else
119
+ ends_target -= 1
120
+ end
121
+ hit = ExoBlastHit.new(start_target+1, ends_target+1, start_query+1, ends_query+1)
122
+ define_hit_parameters(hit, features, tags)
123
+ query.add_hit(hit)
124
+
125
+ #Define alignment and blast like parameters
126
+ target_alignment = ''
127
+ query_alignment = ''
128
+ counter_target = start_target
129
+ counter_query = start_query
130
+ if do_align #get seqs
131
+ query_seq = @prot_seqs[features['query_id']]
132
+ target_seq = @seqs[features['target_id']]
133
+ end
134
+ counter_target, target_seq = do_reverso_complementary(counter_target, target_seq) if features['target_strand'] == '-'
135
+ query_frameshift = []
136
+ target_frameshift = []
137
+ gap_shift = 0
138
+ #puts features['query_id']+ ' ' +features['target_strand'], '-----------------------'
139
+ tags.each_with_index do |tag, n_operation|
140
+ #puts tag.inspect
141
+ if do_align
142
+ gap_shift = 0 if tag[OPERATION] != 'G'
143
+ query_alignment << query_seq[counter_query, tag[QUERY]]
144
+ target_alignment << target_seq[counter_target, tag[TARGET]].translate
145
+ end
146
+ if tag[OPERATION] == 'F'
147
+ if tag[TARGET] > 0 && tag[TARGET] < 3 #TRUE FRAMESHIFT
148
+ gap_shift += 1
149
+ if tags[n_operation+1][OPERATION] != 'G' #there are frameshift that not insert a gap, we do it
150
+ query_alignment << '-' if do_align
151
+ end
152
+ else
153
+ query_alignment << '-' * (tag[TARGET]/3.0).ceil if do_align
154
+ end
155
+ query_frameshift << counter_query
156
+ fs_counter_target = counter_target
157
+ fs_counter_target = target_seq.length - counter_target if features['target_strand'] == '-' # ESto es un apaño, habria que plantear el parseo de las reversas como reduccion en el contador del formato del exonerate, en vez de como adiccion
158
+ if tag[TARGET] > 3
159
+ real_fs = tag[TARGET]%3
160
+ real_gap = tag[TARGET] - real_fs
161
+ fs = [fs_counter_target + real_gap, real_fs]
162
+ else
163
+ fs = [fs_counter_target, tag[TARGET]]
164
+ end
165
+ target_frameshift << fs
166
+ elsif tag[OPERATION] == 'G'
167
+ query_alignment << '-' * (tag[TARGET]/3.0).ceil if do_align
168
+ diff = tag[QUERY] - gap_shift
169
+ target_alignment << '-' * diff if do_align && diff > 0
170
+ gap_shift = 0
171
+ end
172
+ counter_query += tag[QUERY]
173
+ counter_target += tag[TARGET]
174
+ end
175
+ hit.s_frameshift = query_frameshift
176
+ hit.q_frameshift = target_frameshift
177
+
178
+ #puts "\e[33m#{target_alignment}\e[0m", "\e[36m#{query_alignment}\e[0m"
179
+ if do_align
180
+ hit.q_seq = target_alignment
181
+ hit.s_seq = query_alignment
182
+ hit.align_len = query_alignment.length
183
+ hit.ident = set_ident(target_alignment,query_alignment)
184
+ end
185
+ end #def
186
+
187
+ def do_reverso_complementary(counter_target, target_seq)
188
+ counter_target = target_seq.length - 1 - counter_target
189
+ target_seq = target_seq.complementary_dna
190
+ return counter_target, target_seq
191
+ end
192
+
193
+ def set_ident(target_alignment, query_alignment)
194
+ matchs = 0
195
+ position = 0
196
+ target_alignment.each_char do |char|
197
+ matchs +=1 if char == query_alignment[position]
198
+ position +=1
199
+ end
200
+ perc_ident = ('%.2f' % (matchs*100.0/target_alignment.length)).to_f
201
+ return perc_ident
202
+ end
203
+
204
+ def define_hit_parameters(hit, features, tags)
205
+ hit.gaps = 0
206
+ tags.map{|aln| hit.gaps += 1 if aln[0] == 'G'}
207
+ hit.reversed = FALSE
208
+ hit.align_len =(features['query_end_align'] - features['query_start_align']).abs+1
209
+ hit.mismatches=0
210
+ hit.e_val=0
211
+ hit.bit_score=0
212
+ hit.score = features['score']
213
+ hit.s_frame = nil
214
+ strand = 1
215
+ strand = -1 if features['target_strand'] == '-'
216
+ hit.q_frame = (((features['target_start_align']) % 3) +1) *strand
217
+ hit.subject_id = features['query_id']
218
+ hit.full_subject_length=0
219
+ hit.definition=''
220
+ hit.acc=features['query_id']
221
+ hit.q_seq=''
222
+ hit.s_seq=''
223
+ end
224
+
225
+ # inspect results
226
+ def inspect
227
+ res = "Exonerate results:\n"
228
+ res+= '-'*20
229
+ res+= "\nQuerys: #{@querys.count}\n"
230
+ @querys.each{|q| res+=q.inspect+"\n"}
231
+ return res
232
+ end
233
+
234
+ # find query by name
235
+ def find_query(querys,name_q)
236
+ # newq = querys.find{|q| ( q.find{|h| (h.subject_id)})}
237
+ new_q=nil
238
+
239
+ if !querys.empty?
240
+ new_q=querys.find{|q| (q.query_id==name_q)}
241
+ end
242
+
243
+ return new_q
244
+ end
245
+
246
+ # check if there are querys
247
+ def empty?
248
+
249
+ return @querys.empty?
250
+ end
251
+
252
+ # get query count
253
+ def size
254
+ @querys.size
255
+ end
256
+
257
+ attr_accessor :querys
258
+ end
@@ -1,688 +1,297 @@
1
-
1
+ require 'types'
2
2
  require 'une_los_hit'
3
3
 
4
4
  module FlAnalysis
5
5
 
6
- def analiza_orf_y_fl(seq, blast_query, options, db_name)
7
- aas_n_end = options[:distance]
8
- pident_threshold = options[:ident]
9
- evalue_threshold = options[:evalue]
10
- # @verbose = options[:verbose]
11
-
12
- # test_blast_hits(blast_query)
13
-
14
- # used to detect if the sequence and the blast are from different query
15
- if seq.seq_name != blast_query.query_def
16
- raise "BLAST query name and sequence are different"
6
+ $global_warnings = []
7
+
8
+ def analiza_orf_y_fl(seq, hit, options, db_name)
9
+ query_fasta = seq.seq_fasta.upcase.dup # Upcase for prevents complications with masked sequences, dup for discard changes
10
+ if hit.count > 1 # if the sequence has more than one hit, the frames are checked and fixed to get a single hit
11
+ seq_unida = UneLosHit.new(hit, query_fasta)
12
+ full_prot = seq_unida.full_prot
13
+ query_fasta = seq_unida.output_seq # repaired fasta
14
+ final_hit = seq_unida.final_hit # single hit
15
+ $global_warnings += seq_unida.msgs # warning messages
16
+ else
17
+ query_fasta = reverse_seq(query_fasta, hit.first) if hit.first.q_frame < 0 # si la secuencia esta al reves le damos la vuelta
18
+ final_hit = hit.first # single hit
17
19
  end
18
-
19
- q=blast_query
20
- msgs = ''
21
- atg_status = ''
22
- end_status = ''
23
- final_status = ''
24
-
25
- # the fasta sequence is saved
26
- query_fasta = seq.seq_fasta
20
+ query_fasta = exonerate_fix_frame_shift(query_fasta, hit) if options[:exonerate]
27
21
 
28
- if q.hits[0].nil? # There is no match in blast, the seq go to the next DB
29
- # puts "#{db_name} -- #{q.query_def} --> NO BLASTX match"
30
-
31
- # If the DB is trembl and the seq has annotations from other DB the annotations must be printed
32
- if (db_name =~ /^tr_/)
33
- if (seq.get_annotations(:tmp_annotation).empty?)
34
- if (seq.sec_desc.empty?)
35
- seq.annotate(:apply_tcode,'')
36
- else
37
- seq.annotate(:tmp_annotation,[seq.sec_desc, '','',''],true)
38
- end
39
- else
40
- save_last_db_annotations(seq)
41
- end
42
- end
43
-
44
- return
45
- end
46
- #----------------------------------------------------------------------------------------------------------
47
- warnings = ''
48
- errors = ''
49
- wrong_seq = false
22
+ full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
23
+ original_query_coordinates = [final_hit.q_beg, final_hit.q_end] ## VERBOSE
24
+ seq.show_alignment(final_hit, query_fasta, show_nts) if $verbose > 2 ## VERBOSE
25
+ atg_status, tmp_prot = set_start_codon(final_hit, options[:distance], full_prot, query_fasta)
26
+ end_status, final_prot = find_end(final_hit, options[:distance], tmp_prot, query_fasta)
50
27
 
51
- # if the sequence has more than one hit, the frames are checked and fixed to get an single hit
52
- if (q.hits.count > 1)
53
-
54
- seq_unida = UneLosHit.new(q, query_fasta, pident_threshold)
55
-
56
- wrong_seq = seq_unida.wrong_seq
57
- is_ok = seq_unida.is_ok
58
- q_index_start = seq_unida.q_index_start
59
- full_prot = seq_unida.full_prot
60
-
61
- query_fasta = seq_unida.output_seq # repaired fasta
62
-
63
- final_hit = seq_unida.final_hit # single hit
64
- msgs = seq_unida.msgs # warning messages
65
- x_number = seq_unida.number_x # number of nucleotides used to fix frame errors
66
-
67
- else # if there is only one hit
28
+ puts "\n------------------- POST EXTENSION---------------------" if $verbose > 1 ## VERBOSE
29
+ seq.show_alignment(final_hit, query_fasta, show_nts, original_query_coordinates) if $verbose > 1 ## VERBOSE
30
+ puts "ATG: #{atg_status} STOP: #{end_status}" if $verbose > 2 ## VERBOSE
68
31
 
69
- if (q.hits[0].q_frame.to_i < 0) # si la secuencia esta al reves le damos la vuelta
70
- (query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end) = reverse_seq(query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end)
71
- q.hits[0].reversed = true
72
- end
73
-
74
- final_hit = q.hits[0] # single hit
75
- x_number = 0 # number of nucleotides used to fix frame errors
76
-
77
- full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
78
- (is_ok, q_index_start) = contenidos_en_prot(final_hit, full_prot, q)
79
- end
80
- # test_final_hit(final_hit, query_fasta)
81
- #----------------------------------------------------------------------------------------------------------
82
- if wrong_seq
83
- warnings = "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence, " + warnings
84
- # puts "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence"
85
- errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#1\tcontains sense and antisense hits!!!, putative chimeric sequence, "
86
- error_log(q, seq, warnings, db_name)
87
- return
88
- end
89
- #----------------------------------------------------------------------------------------------------------
90
- warnings += msgs
91
- msgs = ''
92
- #----------------------------------------------------------------------------------------------------------
93
- if (x_number < 0)
94
- warnings = "ERROR#2, unexpected negative index in x_number, " + warnings
95
- # puts "ERROR#2, unexpected negative index in x_number"
96
- errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#2\tunexpected negative index in x_number, "
97
- error_log(q, seq, warnings, db_name)
98
- return
99
- end
100
- #----------------------------------------------------------------------------------------------------------
101
- if (!is_ok)
102
- warnings = "ERROR#3, very serious frame error, " + warnings
103
- # puts "#{q.query_def} ERROR#3, hit was NOT found in the protein"
104
- errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#3\thit was NOT found in the protein, "
105
- # error_log(q, seq, warnings, db_name)
106
- # return
107
- end
108
- #----------------------------------------------------------------------------------------------------------
109
- fiable = false
110
- if ((final_hit.ident >= pident_threshold) && (final_hit.e_val <= evalue_threshold))
111
- fiable = true
32
+ # decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
33
+ type, status = determine_status(atg_status, end_status)
34
+ status = compare_seq_length_with_subject(final_prot, options[:distance], final_hit, type, status)
35
+ if final_prot.length >= 25 && final_prot.length.to_f/final_hit.full_subject_length >= options[:subject_coverage] # Prot length min of 25 aa and subject coverage by generated prot of 25%
36
+ save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
112
37
  end
113
- # if the query protein is large enough at the start of the sequence should have the start codon
114
- if (final_hit.q_beg/3 + aas_n_end >= final_hit.s_beg.to_i)
115
- substring = full_prot[0, q_index_start + 10]
116
- resto_substring = full_prot[q_index_start + 10, full_prot.length - q_index_start - 10]
38
+ end
117
39
 
118
- # to look for the beginning of the protein
119
- (m_substring, atg_status, msgs) = find_start(final_hit.s_beg, substring, fiable, aas_n_end)
120
40
 
121
- # pasting the substring sequence with the rest of the sequence
122
- tmp_prot = "#{m_substring}#{resto_substring}"
123
- # to get the value of the start_ORF index
124
- final_hit.q_beg = final_hit.q_beg.to_i - ((m_substring.length - 10) * 3)
41
+ def set_start_codon(final_hit, distance, full_prot, query_fasta)
42
+ q_index_start = contenidos_en_prot(final_hit.q_seq, full_prot)
43
+ atg_status = nil
44
+ _5prima = q_index_start + distance
45
+
46
+ if final_hit.s_beg == 0 && final_hit.q_seq[0] == 'M' && final_hit.s_seq[0] == 'M' #there is M in query and subject at first position of alignment and subject's M is in first position
47
+ atg_status = 'complete'
48
+ tmp_prot = full_prot[q_index_start..full_prot.length]
49
+ elsif _5prima >= final_hit.s_beg
50
+ amine_seq = full_prot[0, _5prima] #Contiene parte amino de la proteina
51
+ carboxile_seq = full_prot[_5prima, full_prot.length - _5prima] #Contiene parte carboxilo de la proteina hasta el fin de la secuencia
52
+ length_before_cut = amine_seq.length
53
+ amine_seq, atg_status = find_start(final_hit.s_beg, amine_seq, distance) # to look for the beginning of the protein
54
+ tmp_prot = "#{amine_seq}#{carboxile_seq}" # merge seqs in prot
55
+ new_q_beg = final_hit.q_frame-1 + (length_before_cut - amine_seq.length) * 3
56
+ modify_5p_align(new_q_beg, final_hit, query_fasta) if $verbose > 1 ## VERBOSE, Modify query align
57
+ final_hit.q_beg = new_q_beg # to get the value of the start_ORF index
125
58
  else
126
- # if (@verbose)
127
- # puts "beginning too short!"
128
- # end
129
-
59
+ $global_warnings << 'UnexpStopBegSeq' if full_prot[0, q_index_start].rindex('*')
130
60
  atg_status = 'incomplete'
131
- substring = full_prot[0, q_index_start]
132
- distance_s_atg = (final_hit.s_beg.to_i - final_hit.q_beg/3) + 1
133
-
134
- if (substring.rindex('*'))
135
- warnings += "Unexpected stop codon in the beginning of your sequence, "
136
- # if (@verbose)
137
- # puts "#{db_name} -- #{q.query_def} --> Unexpected stop codon in the beginning of your sequence"
138
- # end
139
- end
140
-
141
- final_hit.q_beg = final_hit.q_beg.to_i - (substring.length * 3)
142
61
  tmp_prot = full_prot
143
62
  end
144
- #----------------------------------------------------------------------------------------------------------
145
- # look for the end of the protein
146
- (resto_substring, end_substring, end_status, warnings, putative_end) = find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end)
147
- #----------------------------------------------------------------------------------------------------------
148
- final_prot = "#{resto_substring}#{end_substring}"
149
-
150
- warnings += msgs
151
-
152
- # to get the value of the end_ORF index
153
- if (atg_status == 'complete')
154
- final_hit.q_end = final_hit.q_beg - 3 + (final_prot.length * 3)
155
- else
156
- if (putative_end)
157
- final_hit.q_end = final_hit.q_end - 45 + (putative_end*3)
158
- end
159
- end
160
-
161
- #--------------------------------------------------------------------------------------------------------------
162
- # decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
163
- final_status = determine_status(atg_status,end_status)
164
- #----------------------------------------------------------------------------------------------------------
165
- if (final_prot.length - 2*aas_n_end > final_hit.full_subject_length)
166
- warnings += " your sequence is longer than subject: #{final_prot.length} - #{final_hit.full_subject_length}"
167
63
 
168
- elsif (final_prot.length + aas_n_end < final_hit.full_subject_length)
169
- warnings += " your sequence is shorter than subject: #{final_prot.length} - #{final_hit.full_subject_length}"
170
- if (final_prot.length + 100 < final_hit.full_subject_length) || (final_prot.length*2 < final_hit.full_subject_length)
171
-
172
- if (final_status == 'Complete')
173
- final_status = 'Putative Complete'
174
- warnings += ". Was predicted as Complete, but is very much shorter than de subject"
175
- # if (@verbose)
176
- # puts "#{db_name} -- #{q.query_def} --> your sequence is 100 aas shorter than the subject or shorter than the half length of the subject"
177
- # end
178
- end
179
- end
180
- end
181
-
182
- # test_final_hit(final_hit, query_fasta)
183
- print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
184
-
185
- end
186
-
187
-
188
- def test_blast_hits(q)
189
-
190
- puts "query_def: #{q.query_def} full_query_length: #{q.full_query_length} ------------------------------------------------"
191
-
192
- q.hits.each do |h|
193
- puts "\t subject_id: #{h.acc}"
194
- puts "\t acc: #{h.acc}"
195
- puts "\t full_subject_length: #{h.full_subject_length}"
196
- puts "\t q_beg: #{h.q_beg + 1}"
197
- puts "\t q_end: #{h.q_end + 1}"
198
- puts "\t q_frame: #{h.q_frame}"
199
- puts "\t s_beg: #{h.s_beg + 1}"
200
- puts "\t s_end: #{h.s_end + 1}"
201
- puts "\t s_frame: #{h.s_frame}"
202
- puts "\t align_len: #{h.align_len}"
203
- puts "\t gaps: #{h.gaps}"
204
- puts "\t mismatches: #{h.mismatches}"
205
- puts "\t reversed: #{h.reversed}"
206
- puts "\t score: #{h.score}"
207
- puts "\t bit_score: #{h.bit_score}"
208
- puts "\t ident: #{h.ident}"
209
- puts "\t e_val: #{h.e_val}"
210
- puts "\t definition: #{h.definition}"
211
- puts "\t q_seq: #{h.q_seq}"
212
- puts "\t s_seq: #{h.s_seq}"
213
-
214
- end
215
-
216
- end
217
-
218
-
219
- def test_final_hit(final_hit, query_fasta)
220
-
221
- puts "\t acc: #{final_hit.acc}"
222
- puts "\t full_subject_length: #{final_hit.full_subject_length}"
223
-
224
- puts "\n\t q_frame: #{final_hit.q_frame}"
225
- puts "\t reversed: #{final_hit.reversed}"
226
-
227
- puts "\n\t q_beg-q_end: #{final_hit.q_beg + 1} - #{final_hit.q_end + 1}"
228
- puts "\t s_beg - s_end: #{final_hit.s_beg + 1} - #{final_hit.s_end + 1}"
229
-
230
- puts "\n\t score: #{final_hit.score}, bit_score: #{final_hit.bit_score}, ident: #{final_hit.ident}, e_val: #{final_hit.e_val}"
231
-
232
- puts "\n\t definition: #{final_hit.definition}"
233
- puts "\t q_seq: #{final_hit.q_seq}"
234
- puts "\t s_seq: #{final_hit.s_seq}"
235
-
236
- puts "\nnt q_beg-q_end\n#{query_fasta[final_hit.q_beg..final_hit.q_end]}"
237
- puts "\n\nprot q_beg-q_end\n#{query_fasta[final_hit.q_beg..final_hit.q_end].translate}"
238
-
64
+ return atg_status, tmp_prot
239
65
  end
240
66
 
241
67
 
242
- def error_log(q, seq, warnings, db_name)
243
- # seq.annotate(:error,"#{q.query_def}\t#{warnings}\t#{q.hits[0].definition}")
244
-
245
- if (db_name =~ /^tr_/)
246
- if (seq.get_annotations(:tmp_annotation).empty?)
247
- if (seq.sec_desc.empty?)
248
- if (!q.hits[0].definition.nil?)
249
- warnings = "Coding sequence with some errors, #{warnings}"
250
- seq.sec_desc = "#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\t#{db_name}\tMisassembled\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t"
251
- seq.annotate(:tmp_annotation,[seq.sec_desc, '','',''],true)
252
- else
253
- seq.annotate(:apply_tcode,'')
254
- end
255
- else
256
- warnings = "Coding sequence with some errors, #{warnings}"
257
- tmp_annot = seq.sec_desc.sub('my_warning',"#{warnings}")
258
- seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
259
- end
260
- else
261
- save_last_db_annotations(seq)
262
- end
263
- else
264
- if (seq.sec_desc.empty?)
265
- if (!q.hits[0].definition.nil?)
266
- warnings = "Coding sequence with some errors, #{warnings}"
267
- seq.sec_desc = "#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\t#{db_name}\tMisassembled\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t"
268
- end
269
- end
270
- end
271
-
272
- end
273
-
274
-
275
- def save_last_db_annotations(seq)
276
-
277
- # puts "sequence not complete! recovering annotations from previous database! sldba!!"
278
- (q, final_hit, final_prot, query_fasta, final_status) = seq.get_annotations(:tmp_annotation).first[:message][3]
279
- print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
280
-
281
- (name,fasta_length,acc,db_name,final_status,testcode,e_val,ident,my_length,subject_length,warnings,q_frame,q_beg,q_end,s_beg,s_end,description,final_prot) = seq.get_annotations(:tmp_annotation).first[:message][0].split("\t")
282
- if (final_hit.reversed)
283
- (kk, q_frame, q_end, q_beg) = reverse_seq(query_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i)
284
- end
285
-
286
- seq.annotate(:protein,seq.get_annotations(:tmp_annotation).first[:message][1])
287
- seq.annotate(:alignment,seq.get_annotations(:tmp_annotation).first[:message][2])
288
- tmp_annot = "#{name}\t#{fasta_length}\t#{acc}\t#{db_name}\t#{final_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{final_prot}"
289
- seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
290
-
291
- end
292
-
293
-
294
- def find_start(subject_start, substring, fiable, aas_n_end)
295
-
296
- tmp_prot = ''
297
- msgs = ''
298
- atg_status = 'incomplete' # complete, incomplete or putative
299
-
300
- # puts "\nsubstring (#{substring.length} aas):\n#{substring}"
301
- stop_codon = substring.rindex('*')
302
-
303
- # marcamos la distancia al s_beg desde el principio del substring
304
- # s_beg_distance = (substring.length) - subject_start
305
- s_beg_distance = (substring.length - 10) - subject_start
306
- # marcamos la distancia al s_beg desde el final del substring
307
- atg_distance = (subject_start + 1) - (substring.length - 10)
308
- if (atg_distance <= 0)
309
- atg_distance = 0
310
- else
311
- # puts "expected atg_distance = 0, your sequence atg_distance = #{atg_distance}; limit (1-15)"
312
- msgs = "atg_distance in limit (1-15): atg_distance = #{atg_distance}, "
313
- end
314
-
315
- # puts "s_beg_distance:#{s_beg_distance}, stop_codon: #{stop_codon}, subject_start: #{subject_start + 1}, atg_distance: #{atg_distance}"
316
- #----------------------------------------------------------------------------------------------------------
317
- # tenemos un codon de parada en el substring 5 prima
318
- if (stop_codon)
319
- stop_codon += 1
320
- # ahora vamos a ver si el stop esta antes o despues del s_beg
321
- if (stop_codon <= s_beg_distance) # esta antes
322
- substring = substring[stop_codon, substring.length - stop_codon]
323
- # puts "\nhay un codon de parada en el substring (#{substring.length} aas)\tstop_codon:#{stop_codon +1}\n#{substring}\n\n"
324
-
325
- first_m = substring.index('M')
326
-
327
- if (first_m) # tenemos M y stop ---------------------------------------------------------------------------
328
- substring = substring[first_m, substring.length - first_m]
329
-
68
# Looks for the translation start (first 'M') in the N-terminal part of a
# translated query and trims the protein so that it begins there.
#
# subject_start - position in the subject protein where the alignment begins.
# amine_seq     - translated query stretch to inspect. The code subtracts 10
#                 from its length when measuring distances, so it presumably
#                 holds the 5' region plus the first 10 aligned residues
#                 (NOTE(review): confirm against the caller).
# distance      - tolerance in residues; a Met further than 2 * distance from
#                 the subject start is not accepted when no stop precedes it.
#
# Side effects: appends 'noM1', 'UnexpSTOP5p', 'NoStopMfar' or 'noM2' to
# $global_warnings.
#
# Returns [trimmed_amine_seq, atg_status], where atg_status is 'complete',
# 'putative' or 'incomplete'.
def find_start(subject_start, amine_seq, distance)
  atg_status = 'putative' # complete, incomplete or putative
  stop_codon = amine_seq.rindex('*')
  if stop_codon # there is a stop codon in the 5' amine_seq
    # Length of the stretch that lies upstream of the expected subject start.
    utr_length = amine_seq.length - 10 - subject_start
    amine_seq = amine_seq[stop_codon + 1..-1]
    first_m = amine_seq.index('M')
    if stop_codon <= utr_length # the stop lies in the 5' UTR zone
      if first_m # stop followed by M
        amine_seq = amine_seq[first_m..-1]
        atg_status = 'complete'
      else # stop but no M
        $global_warnings << 'noM1'
      end
    else # stop after the expected start: a frame change hides the real beginning
      $global_warnings << 'UnexpSTOP5p'
      amine_seq = amine_seq[first_m..-1] if first_m
    end
  else # no stop codon
    first_m = amine_seq.index('M')
    if first_m # there is an M
      amine_seq = amine_seq[first_m..-1]
      # The query Met sits (length - 10) residues upstream of the alignment
      # start and the subject Met sits subject_start residues upstream of it;
      # their separation is the absolute difference. The previous expression,
      # (subject_start - length).abs - 10, was 20 residues too small whenever
      # subject_start was larger than the trimmed length.
      m_distance = (subject_start - (amine_seq.length - 10)).abs
      if m_distance > distance * 2 # M found, but too far from the subject start
        $global_warnings << 'NoStopMfar'
        atg_status = 'incomplete'
      else
        atg_status = 'complete'
      end
    else # no M
      $global_warnings << 'noM2'
    end
  end
  return amine_seq, atg_status
end
380
103
 
381
104
 
382
- def find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end)
383
- # aqui vemos lo que queda sin similitud hasta el final
384
- s_end_resto = (final_hit.full_subject_length - (final_hit.s_end.to_i + 1)) # en el subject, numero de aas que necesito cubrir
385
- q_end_resto = (q.full_query_length.to_i - final_hit.q_end.to_i)/3 # en el query, numero de aas que tengo
386
- sq_end_distance = q_end_resto - s_end_resto
387
-
388
- cut_in_5p = full_prot.length - tmp_prot.length
389
-
390
- resto_substring = tmp_prot[0..final_hit.q_end/3 - cut_in_5p - 16]
391
- end_substring = tmp_prot[final_hit.q_end/3 - cut_in_5p - 15..tmp_prot.length]
392
- putative_end = end_substring.index('*')
393
-
394
- # si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas)
395
- if (sq_end_distance + aas_n_end < 0)
105
# Determines whether the C-terminal end of the predicted protein is present
# and trims the protein at the first stop codon found in the terminal region.
#
# final_hit    - hit object; q_beg, q_end, s_end, s_len and align_len are read
#                and q_end is overwritten with the recalculated ORF end.
# max_distance - tolerance in residues around the end of the alignment.
# tmp_prot     - translated query starting at the accepted start position.
# query_fasta  - nucleotide sequence of the query.
#
# Side effects: may append 'UnexpSTOP3pDist'/'DistSubj' pairs, 'UnexpSTOP3p',
# 'QueryTooLong' or 'ProtFusion' to $global_warnings; updates final_hit.q_end;
# when $verbose is greater than 1 also adjusts final_hit.q_seq through
# modify_3p_align.
#
# Returns [end_status, final_prot], where end_status is 'complete',
# 'putative' or 'incomplete'.
def find_end(final_hit, max_distance, tmp_prot, query_fasta)
  frame_shift = check_frame_shift(final_hit)
  # Coordinate in tmp_prot where the terminal search region begins.
  beg_end_string = (final_hit.q_end - final_hit.q_beg) / 3 - max_distance
  atg_substring = tmp_prot[0..beg_end_string] # protein without the terminal region
  end_substring = tmp_prot[beg_end_string + 1..tmp_prot.length - 1] # 3' part of the unigene
  if beg_end_string < 0 || end_substring.nil?
    # The homology sits at the very end of the sequence: there is no 3' part.
    atg_substring = tmp_prot
    end_substring = ''
    end_status = 'incomplete'
  else
    end_status = 'putative'
    putative_end = end_substring.index('*')
    end_substring = end_substring[0..putative_end] if putative_end

    s_end_resto = final_hit.s_len - (final_hit.s_end + 1) # residues still to cover on the subject
    q_end_resto = (query_fasta.length - final_hit.q_end) / 3 # residues available on the query
    # Measured from the end of the hit so that a 5'-truncated sequence does
    # not bias the calculation.
    sq_end_distance = q_end_resto - s_end_resto

    full_length_stop = final_hit.align_len == final_hit.s_len && putative_end
    # max_distance * 2 comes from the +-max_distance margin at the end of the alignment.
    nearby_stop = sq_end_distance.abs <= max_distance && putative_end && putative_end <= max_distance * 2
    if full_length_stop || nearby_stop
      end_status = 'complete'
    elsif sq_end_distance < max_distance # not enough sequence to contain the stop
      end_status = 'incomplete'
      if putative_end
        $global_warnings << ['UnexpSTOP3pDist', sq_end_distance.abs]
      else
        $global_warnings << ['DistSubj', sq_end_distance.abs]
      end
    else # enough sequence
      if putative_end # a stop codon was found
        # Gap between the query stop and the subject stop.
        stop_q_s = beg_end_string + putative_end - final_hit.s_len
        if stop_q_s.abs <= max_distance # stop codon inside the search region
          end_status = 'complete'
        elsif stop_q_s < 0
          $global_warnings << 'UnexpSTOP3p'
        elsif stop_q_s > 0
          end_status = 'complete'
          $global_warnings << 'QueryTooLong'
        end
      else # enough sequence but no stop codon
        end_status = 'incomplete'
        $global_warnings << 'ProtFusion'
      end
    end
  end
  final_prot = atg_substring + end_substring
  end_status = 'complete' if final_prot.length == final_hit.s_len + 1 && final_prot[final_prot.length - 1] == '*'
  new_q_end = final_hit.q_beg - 1 + final_prot.length * 3 + frame_shift
  # $verbose may be unset; guard it the same way show_nts does.
  modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) if $verbose && $verbose > 1
  final_hit.q_end = new_q_end
  return end_status, final_prot
end
 
447
161
 
448
- def determine_status(atg_status,end_status)
449
-
450
- if (atg_status == 'complete') && (end_status == 'complete') # proteina completa
451
- final_status = 'Complete'
452
- elsif (atg_status == 'putative' && end_status == 'complete') || (atg_status == 'complete' && end_status == 'putative') || (atg_status == 'putative' && end_status == 'putative') # comienzo y/o final putative
453
- final_status = 'Putative Complete'
454
- elsif (atg_status == 'incomplete') && (end_status == 'incomplete') # region intermedia
455
- final_status = 'Internal'
456
- elsif (atg_status == 'complete') && (end_status == 'incomplete') # tenemos el principio de la proteina
457
- final_status = 'N-terminus'
458
- elsif (atg_status == 'putative') && (end_status == 'incomplete') # puede que tengamos el principio de la proteina
459
- final_status = 'Putative N-terminus'
460
- elsif (atg_status == 'incomplete') && (end_status == 'complete') # tenemos el final de la proteina
461
- final_status = 'C-terminus'
462
- elsif (atg_status == 'incomplete') && (end_status == 'putative') # puede que tengamos el final de la proteina
463
- final_status = 'Putative C-terminus'
162
# Combines the start and end states of a predicted protein into a sequence
# type and a certainty flag.
#
# atg_status - 'complete', 'putative' or 'incomplete' state of the N-terminus.
# end_status - 'complete', 'putative' or 'incomplete' state of the C-terminus.
#
# Returns [type, status]: type is one of the COMPLETE, N_TERMINAL, C_TERMINAL
# or INTERNAL constants; status is false when either end is only 'putative'
# and true otherwise.
def determine_status(atg_status, end_status)
  has_start = atg_status != 'incomplete'
  has_end = end_status != 'incomplete'
  type =
    if has_start && has_end # whole protein
      COMPLETE
    elsif has_start # only the beginning of the protein
      N_TERMINAL
    elsif has_end # only the end of the protein
      C_TERMINAL
    else # internal region
      INTERNAL
    end

  # Literal booleans are used because the TRUE/FALSE constants were deprecated
  # in Ruby 2.4 and are no longer defined in recent Ruby versions.
  status = !(atg_status == 'putative' || end_status == 'putative')

  return type, status
end
518
181
 
519
- if (final_hit.reversed)
520
- (kk, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i)
521
- end
522
182
 
523
- seq.annotate(:alignment,"#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n")
524
- seq.annotate(:protein,">#{q.query_def}\n#{final_prot}")
525
- tmp_annot = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}"
526
- seq.annotate(:tmp_annotation,[tmp_annot, '','',''])
527
- #-------------------------------------------------------------------------------------------------------------------------------------
528
- else # cargamos anotaciones para la siguiente BD
529
- tmp_prot = ">#{q.query_def}\n#{final_prot}"
530
- tmp_align = "#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n"
531
- tmp_annot = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}"
532
- seq.sec_desc = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\tMisassembled\t\t#{final_hit.e_val}\t#{final_hit.ident}\t\t#{final_hit.full_subject_length}\t#{warnings}\t\t\t\t\t\t#{final_hit.definition}\t"
533
- seq.annotate(:tmp_annotation,[tmp_annot, tmp_prot,tmp_align,[q, final_hit, final_prot, query_fasta, final_status]])
534
-
535
- # puts "\n\n\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.---#{q.query_def}\t#{final_status}\n#{tmp_prot}"
536
- # puts "#{db_name} -- #{q.query_def} --> print_annotations: cargamos anotaciones para utilizarlas en la siguiente BD"
183
# Compares the length of the predicted protein with the subject length and
# records a warning when they differ by more than 2 * distance residues.
#
# final_prot - predicted protein string.
# distance   - tolerance in residues.
# final_hit  - hit object; only s_len is read.
# type       - sequence type; only compared against COMPLETE.
# status     - current certainty flag.
#
# Side effects: may append ['SeqLonger', len, s_len], ['SeqShorter', len, s_len]
# and 'VeryShorter' to $global_warnings.
#
# Returns status, downgraded to false when a COMPLETE protein is more than
# 100 residues shorter than the subject or less than half its length.
def compare_seq_length_with_subject(final_prot, distance, final_hit, type, status)
  prot_len = final_prot.length
  subject_len = final_hit.s_len
  if prot_len - 2 * distance > subject_len
    $global_warnings << ['SeqLonger', prot_len, subject_len]
  elsif prot_len + 2 * distance < subject_len
    $global_warnings << ['SeqShorter', prot_len, subject_len]
    very_short = prot_len + 100 < subject_len || prot_len * 2 < subject_len
    if very_short && type == COMPLETE
      # Literal false: the FALSE constant is not defined in recent Ruby versions.
      status = false
      $global_warnings << 'VeryShorter'
    end
  end
  return status
end
541
197
 
542
198
 
543
- def print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status)
544
-
545
- bad_atg = false
546
- #------------------------------------------------------------------------------------------------------------- ATG
547
-
548
- if (final_status == 'Complete') || (final_status == 'Putative Complete') || (final_status == 'Putative N-terminus') || (final_status == 'N-terminus')
549
- # puts "entra aqui, final_status: #{final_status}"
550
- my_seq_n = query_fasta[final_hit.q_beg - 5..final_hit.q_beg + 5]
551
-
552
- beg5 = false
553
- # ------------------------------------- si my_seq_n = nil puede ser porque q_beg sea < 5
554
- if (final_hit.q_beg < 6)
555
- my_seq_n = query_fasta[0..10]
556
- beg5 = true
557
- # puts "empieza en el borde de la seq"
199
# Stores the result of the current analysis on the sequence object.
#
# The annotation is saved only when the sequence has no previous information
# (its type is UNKNOWN) or when the new result is COMPLETE and the stored one
# is not.
#
# seq         - sequence object; type, status, db_name, seq_fasta, seq_aa,
#               hit, seq_nt and ignore are written, and the accumulated
#               warnings are passed to seq.warnings.
# final_hit   - hit that supports the annotation.
# type        - sequence type constant for the new result.
# status      - certainty flag for the new result.
# final_prot  - predicted protein.
# query_fasta - nucleotide sequence of the query.
# db_name     - name of the database that produced the hit.
#
# Side effects: resets $global_warnings to a new empty array after saving;
# prints the annotation qualification when $verbose is greater than 2.
def save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
  if seq.type == UNKNOWN || (type == COMPLETE && seq.type != COMPLETE)
    seq.type = type
    seq.status = status
    seq.db_name = db_name
    seq.seq_fasta = query_fasta
    seq.seq_aa = final_prot
    seq.hit = final_hit
    seq.warnings($global_warnings)
    $global_warnings = [] # clean all warnings for the current sequence
    seq.seq_nt = mark_nt_seqs(final_hit, query_fasta)
    # Literal true: the TRUE constant is not defined in recent Ruby versions.
    seq.ignore = true if type == COMPLETE
  end
  # $verbose may be unset; guard it the same way show_nts does.
  puts "\e[1mStruct annot: #{seq.prot_annot_calification}\e[0m" if $verbose && $verbose > 2
end
603
219
 
604
- if (bad_atg == true)
605
- stop_c = my_seq[final_hit.q_end - 2..final_hit.q_end]
606
- stop_c_longer = my_seq[final_hit.q_end - 7..final_hit.q_end + 5]
607
- else
608
- stop_c = my_seq[final_hit.q_end + 3..final_hit.q_end + 5]
609
- stop_c_longer = my_seq[final_hit.q_end - 2..final_hit.q_end + 10]
610
- end
611
220
 
221
# Builds a copy of the nucleotide sequence with the ORF boundaries marked.
#
# '_-_' is inserted before the ORF when it begins with ATG, and '___' after
# the ORF when its last codon is TAG, TGA or TAA.
#
# final_hit   - hit object; q_beg and q_end are read as the inclusive
#               0-based ORF limits on query_fasta.
# query_fasta - nucleotide sequence of the query.
#
# Returns the marked nucleotide string.
def mark_nt_seqs(final_hit, query_fasta)
  q_beg = final_hit.q_beg
  q_end = final_hit.q_end

  mark_atg = query_fasta[q_beg..q_beg + 2] == 'ATG' ? '_-_' : nil
  stop = query_fasta[q_end - 2..q_end]
  mark_stop = %w[TAG TGA TAA].include?(stop) ? '___' : nil

  # Exclusive range: with q_beg == 0 the old [0..q_beg-1] became [0..-1] and
  # returned the whole sequence instead of an empty 5' flank.
  seq5p = query_fasta[0...q_beg]
  orf = query_fasta[q_beg..q_end]
  # The 3' flank starts after q_end; starting at q_end repeated the last
  # nucleotide of the ORF.
  seq3p = query_fasta[q_end + 1..-1]
  "#{seq5p}#{mark_atg}#{orf}#{mark_stop}#{seq3p}"
end
613
238
 
614
- if (!stop_c.nil?)
615
- # puts stop_c
616
- # puts stop_c_longer
617
- if (stop_c.translate == '*')
618
-
619
- if (bad_atg == true)
620
- my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1]
621
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
622
- else
623
-
624
- my_seq = my_seq[0..final_hit.q_end + 5] +'___'+ my_seq[final_hit.q_end + 6..my_seq.length + 1]
625
- my_prot = my_seq.sub(/\w+_\-_/,'')
626
- my_prot = my_prot.sub(/___\w+/,'')
627
- my_prot = my_prot.translate
628
- my_prot = my_prot.sub(/x$/,'')
629
-
630
- simliar_fragment = final_prot.lcs(my_prot)
631
-
632
- if (simliar_fragment.length == final_prot.length) && (simliar_fragment.length == my_prot.length)
633
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\t\t\t\t\t\t#{my_seq}")
634
- else
635
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tthe nucleotide sequence contain a lot of errors\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
636
- # puts "nt seq: was no possible to find stop codon, the nucleotide sequence contain a lot of errors"
239
# Pads the query with 'n' characters at every frameshift reported by the HSPs
# so that each frameshift spans a whole codon, and shifts the HSP query
# coordinates accordingly.
#
# query_fasta - nucleotide sequence of the query; it is not modified.
# hit         - collection of HSPs. Only objects whose class name is
#               'ExoBlastHit' are asked for q_frameshift, which is iterated as
#               (position, num_nts) pairs.
#
# Side effects: appends ['ExFrameS', position] to $global_warnings for every
# frameshift; adds the accumulated padding to q_end of every HSP and to q_beg
# of every HSP but the first.
# NOTE(review): q_beg of later HSPs is also shifted by the padding of their
# own frameshifts; confirm that this matches how ExoBlastHit stores them.
#
# Returns the padded sequence as a new string, or query_fasta itself when no
# frameshift was found.
def exonerate_fix_frame_shift(query_fasta, hit)
  frame_shifts = []
  added_nts = 0
  hit.each_with_index do |hsp, num|
    if hsp.class.to_s == 'ExoBlastHit' # only this BlastHit class has frameshift attributes
      hsp.q_frameshift.each do |position, num_nts|
        local_add = 3 - num_nts
        fs_final_position = position + num_nts
        $global_warnings << ['ExFrameS', fs_final_position]
        frame_shifts << [fs_final_position, local_add]
        added_nts += local_add
      end
    end
    hsp.q_beg += added_nts if num > 0
    hsp.q_end += added_nts
  end
  return query_fasta if frame_shifts.empty?

  # String#insert changes its receiver, so work on a copy to leave the
  # caller's sequence untouched.
  fixed_fasta = query_fasta.dup
  offset = 0
  frame_shifts.each do |position, padding|
    fixed_fasta.insert(position + offset, 'n' * padding)
    offset += padding
  end
  return fixed_fasta
end
640
264
 
641
- else
642
- if (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus')
643
265
 
644
- if (bad_atg == true)
645
- stop_c = my_seq[final_hit.q_end+1..final_hit.q_end+3]
646
- stop_c_longer = my_seq[final_hit.q_end - 4..final_hit.q_end + 8]
647
- else
648
- stop_c = my_seq[final_hit.q_end + 7..final_hit.q_end + 9]
649
- stop_c_longer = my_seq[final_hit.q_end..final_hit.q_end + 13]
650
- end
651
-
652
- if (!stop_c.nil?)
653
- if (stop_c.translate == '*')
654
- final_hit.q_end = final_hit.q_end + 3
655
- if (bad_atg == true)
656
- my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1]
657
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
658
- else
659
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
660
- end
661
- else
662
- if (bad_atg == true)
663
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
664
- # puts "find nt end: NO ATG, NO exact STOP"
665
- else
666
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
667
- # puts "find nt end: GOOD ATG, NO exact STOP"
668
- end
669
- end
670
- end
671
- end
672
-
266
+ ## VERBOSE METHODS
267
# Tells whether nucleotide sequences should be shown in the verbose output.
#
# Returns true when $verbose is set and greater than 3, false otherwise.
# Literal booleans are returned because the TRUE/FALSE constants were
# deprecated in Ruby 2.4 and are no longer defined in recent Ruby versions.
def show_nts
  return !!($verbose && $verbose > 3)
end
673
272
 
674
- end
675
273
 
676
- else
274
# Adjusts the aligned query string (final_hit.q_seq) after the 3' end of the
# ORF has been recalculated. Used only for the visual report.
#
# new_q_end   - recalculated end coordinate of the ORF on the query.
# final_hit   - hit object; q_end is read and q_seq is rewritten.
# query_fasta - nucleotide sequence of the query.
# final_prot  - predicted protein; its length limits the trimmed alignment.
#
# Nothing is changed when new_q_end equals final_hit.q_end.
# NOTE(review): relies on String#translate, which is not a core Ruby method;
# presumably provided by the project's string utilities.
+ def modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) ## For visual report
275
+ if new_q_end > final_hit.q_end # The alignment is extended: append the translation of the extra nucleotides
276
+ extend_align = query_fasta[final_hit.q_end+1 .. new_q_end].translate
277
+ final_hit.q_seq = final_hit.q_seq + extend_align
278
+ elsif new_q_end < final_hit.q_end # The alignment is cut
# Every '-' in q_seq is added to the limit, not only those before the cut.
279
+ upper_limit = final_prot.length - 1 + final_hit.q_seq.count('-')
280
+ final_hit.q_seq = final_hit.q_seq[0 .. upper_limit]
281
+ end
282
+ end
677
283
 
678
- if (bad_atg == true)
679
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP\t\t\t\t\t#{my_seq}")
680
- else
681
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP\t\t\t\t\t#{my_seq}")
682
- end
683
284
 
285
# Adjusts the aligned query string (final_hit.q_seq) after the 5' start of the
# ORF has been recalculated. Used only for the visual report.
#
# new_q_beg   - recalculated start coordinate of the ORF on the query.
# final_hit   - hit object; q_beg is read and q_seq is rewritten.
# query_fasta - nucleotide sequence of the query.
#
# Nothing is changed when new_q_beg equals final_hit.q_beg.
# NOTE(review): relies on String#translate, which is not a core Ruby method;
# presumably provided by the project's string utilities.
+ def modify_5p_align(new_q_beg, final_hit, query_fasta) ## For visual report
286
+ if new_q_beg < final_hit.q_beg # The alignment is extended: prepend the translation of the extra nucleotides
287
+ extend_align = query_fasta[new_q_beg .. final_hit.q_beg-1].translate
288
+ final_hit.q_seq = extend_align + final_hit.q_seq
289
+ elsif new_q_beg > final_hit.q_beg # The alignment is cut
# Number of residues to drop from the start of the aligned string.
290
+ seq_cut = (new_q_beg - final_hit.q_beg)/3
# Gap characters found in the leading stretch enlarge the cut.
291
+ gaps = final_hit.q_seq[0..seq_cut].count('-')
292
+ seq_cut += gaps
293
+ final_hit.q_seq = final_hit.q_seq[seq_cut .. final_hit.q_seq.length-1]
684
294
  end
685
-
686
295
  end
687
296
 
688
297
  end