full_lengther_next 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,688 @@
1
+
2
+ require 'une_los_hit'
3
+
4
+ module FlAnalysis
5
+
6
# Classifies one sequence against its BLAST result and records the outcome
# as annotations on +seq+.
#
# seq         - sequence object; this method reads seq_name, seq_fasta,
#               sec_desc and get_annotations, and calls annotate.
# blast_query - BLAST result for the same sequence (query_def, hits).
# options     - Hash read for :distance (tolerance in amino acids),
#               :ident (identity threshold) and :evalue (e-value threshold).
# db_name     - database name; names starting with "tr_" get special
#               handling when there is no hit.
#
# Raises RuntimeError when the sequence name and the BLAST query name differ.
# Returns nil on the early-exit paths, otherwise whatever print_annotations
# returns.
def analiza_orf_y_fl(seq, blast_query, options, db_name)
  aas_n_end = options[:distance]
  pident_threshold = options[:ident]
  evalue_threshold = options[:evalue]

  # used to detect if the sequence and the blast are from different query
  if seq.seq_name != blast_query.query_def
    raise "BLAST query name and sequence are different"
  end

  q = blast_query

  # the fasta sequence is saved
  query_fasta = seq.seq_fasta

  if q.hits[0].nil? # there is no match in blast, the sequence goes to the next DB
    # If the DB is trembl and the sequence has annotations from another DB,
    # those annotations must be printed
    if db_name =~ /^tr_/
      if seq.get_annotations(:tmp_annotation).empty?
        if seq.sec_desc.empty?
          seq.annotate(:tcode, '')
        else
          seq.annotate(:tmp_annotation, [seq.sec_desc, '', '', ''], true)
        end
      else
        save_last_db_annotations(seq)
      end
    end

    return
  end

  warnings = ''
  msgs = ''
  wrong_seq = false

  # if the sequence has more than one hit, the frames are checked and fixed to get a single hit
  if q.hits.count > 1
    seq_unida = UneLosHit.new(q, query_fasta, pident_threshold)

    wrong_seq = seq_unida.wrong_seq
    is_ok = seq_unida.is_ok
    q_index_start = seq_unida.q_index_start
    full_prot = seq_unida.full_prot

    query_fasta = seq_unida.output_seq # repaired fasta

    final_hit = seq_unida.final_hit    # single hit
    msgs = seq_unida.msgs              # warning messages
    x_number = seq_unida.number_x      # number of nucleotides used to fix frame errors
  else # there is only one hit
    if q.hits[0].q_frame.to_i < 0 # the hit is on the reverse strand, so the sequence is turned around
      (query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end) = reverse_seq(query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end)
      q.hits[0].reversed = true
    end

    final_hit = q.hits[0] # single hit
    x_number = 0          # number of nucleotides used to fix frame errors

    full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
    (is_ok, q_index_start) = contenidos_en_prot(final_hit, full_prot, q)
  end

  if wrong_seq
    warnings = "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence, " + warnings
    error_log(q, seq, warnings, db_name)
    return
  end

  warnings += msgs
  msgs = ''

  if x_number < 0
    warnings = "ERROR#2, unexpected negative index in x_number, " + warnings
    error_log(q, seq, warnings, db_name)
    return
  end

  # ERROR#3 is only recorded in the warnings; the analysis carries on
  unless is_ok
    warnings = "ERROR#3, very serious frame error, " + warnings
  end

  # the hit is considered reliable when it passes both thresholds
  fiable = false
  if (final_hit.ident >= pident_threshold) && (final_hit.e_val <= evalue_threshold)
    fiable = true
  end

  # if the query protein is large enough, the start of the sequence should have the start codon
  if final_hit.q_beg/3 + aas_n_end >= final_hit.s_beg.to_i
    substring = full_prot[0, q_index_start + 10]
    resto_substring = full_prot[q_index_start + 10, full_prot.length - q_index_start - 10]

    # to look for the beginning of the protein
    (m_substring, atg_status, msgs) = find_start(final_hit.s_beg, substring, fiable, aas_n_end)

    # pasting the substring sequence with the rest of the sequence
    tmp_prot = "#{m_substring}#{resto_substring}"
    # to get the value of the start_ORF index
    final_hit.q_beg = final_hit.q_beg.to_i - ((m_substring.length - 10) * 3)
  else
    # beginning too short
    atg_status = 'incomplete'
    substring = full_prot[0, q_index_start]

    if substring.rindex('*')
      warnings += "Unexpected stop codon in the beginning of your sequence, "
    end

    final_hit.q_beg = final_hit.q_beg.to_i - (substring.length * 3)
    tmp_prot = full_prot
  end

  # look for the end of the protein
  (resto_substring, end_substring, end_status, warnings, putative_end) = find_end(final_hit, q, full_prot, tmp_prot, '', warnings, aas_n_end)

  final_prot = "#{resto_substring}#{end_substring}"

  warnings += msgs

  # to get the value of the end_ORF index
  if atg_status == 'complete'
    final_hit.q_end = final_hit.q_beg - 3 + (final_prot.length * 3)
  elsif putative_end
    final_hit.q_end = final_hit.q_end - 45 + (putative_end*3)
  end

  # decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
  final_status = determine_status(atg_status, end_status)

  if final_prot.length - 2*aas_n_end > final_hit.full_subject_length
    warnings += " your sequence is longer than subject: #{final_prot.length} - #{final_hit.full_subject_length}"
  elsif final_prot.length + aas_n_end < final_hit.full_subject_length
    warnings += " your sequence is shorter than subject: #{final_prot.length} - #{final_hit.full_subject_length}"
    # more than 100 aas shorter than the subject, or shorter than half of it
    if (final_prot.length + 100 < final_hit.full_subject_length) || (final_prot.length*2 < final_hit.full_subject_length)
      if final_status == 'Complete'
        final_status = 'Putative Complete'
        warnings += ". Was predicted as Complete, but is very much shorter than de subject"
      end
    end
  end

  print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
end
186
+
187
+
188
# Debug helper: prints the query header and then every field of every hit in
# +q+ to standard output. Coordinates are printed with 1 added.
# Returns whatever q.hits.each returns.
def test_blast_hits(q)
  puts "query_def: #{q.query_def} full_query_length: #{q.full_query_length} ------------------------------------------------"

  q.hits.each do |hit|
    report = [
      "\t subject_id: #{hit.acc}",
      "\t acc: #{hit.acc}",
      "\t full_subject_length: #{hit.full_subject_length}",
      "\t q_beg: #{hit.q_beg + 1}",
      "\t q_end: #{hit.q_end + 1}",
      "\t q_frame: #{hit.q_frame}",
      "\t s_beg: #{hit.s_beg + 1}",
      "\t s_end: #{hit.s_end + 1}",
      "\t s_frame: #{hit.s_frame}",
      "\t align_len: #{hit.align_len}",
      "\t gaps: #{hit.gaps}",
      "\t mismatches: #{hit.mismatches}",
      "\t reversed: #{hit.reversed}",
      "\t score: #{hit.score}",
      "\t bit_score: #{hit.bit_score}",
      "\t ident: #{hit.ident}",
      "\t e_val: #{hit.e_val}",
      "\t definition: #{hit.definition}",
      "\t q_seq: #{hit.q_seq}",
      "\t s_seq: #{hit.s_seq}"
    ]
    puts report
  end
end
217
+
218
+
219
# Debug helper: prints the fields of +final_hit+ to standard output, followed
# by the slice of +query_fasta+ from q_beg to q_end and the result of calling
# translate on that slice. Coordinates are printed with 1 added.
def test_final_hit(final_hit, query_fasta)
  hit_region = query_fasta[final_hit.q_beg..final_hit.q_end]

  puts [
    "\t acc: #{final_hit.acc}",
    "\t full_subject_length: #{final_hit.full_subject_length}",
    "\n\t q_frame: #{final_hit.q_frame}",
    "\t reversed: #{final_hit.reversed}",
    "\n\t q_beg-q_end: #{final_hit.q_beg + 1} - #{final_hit.q_end + 1}",
    "\t s_beg - s_end: #{final_hit.s_beg + 1} - #{final_hit.s_end + 1}",
    "\n\t score: #{final_hit.score}, bit_score: #{final_hit.bit_score}, ident: #{final_hit.ident}, e_val: #{final_hit.e_val}",
    "\n\t definition: #{final_hit.definition}",
    "\t q_seq: #{final_hit.q_seq}",
    "\t s_seq: #{final_hit.s_seq}",
    "\nnt q_beg-q_end\n#{hit_region}",
    "\n\nprot q_beg-q_end\n#{hit_region.translate}"
  ]
end
240
+
241
+
242
# Records a sequence that failed the analysis as a "Coding Seq" entry.
#
# For databases whose name does not start with "tr_", only seq.sec_desc is
# filled in, and only when it is empty and the first hit has a definition.
# For "tr_" databases:
#   * existing :tmp_annotation entries are handed to save_last_db_annotations;
#   * otherwise a non-empty sec_desc is re-annotated, with the literal text
#     'my_warning' replaced by the warnings;
#   * otherwise a new record is stored in sec_desc and annotated, or an empty
#     :tcode annotation is written when the first hit has no definition.
def error_log(q, seq, warnings, db_name)
  flagged = "Coding sequence with some errors, #{warnings}"
  coding_record = lambda do
    "#{q.query_def}\t#{seq.seq_fasta.length}\t#{q.hits[0].acc}\t#{db_name}\tCoding Seq\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{flagged}\t\t\t\t\t\t#{q.hits[0].definition}\t"
  end

  if db_name !~ /^tr_/
    if seq.sec_desc.empty? && !q.hits[0].definition.nil?
      seq.sec_desc = coding_record.call
    end
  elsif !seq.get_annotations(:tmp_annotation).empty?
    save_last_db_annotations(seq)
  elsif !seq.sec_desc.empty?
    seq.annotate(:tmp_annotation, [seq.sec_desc.sub('my_warning', flagged), '', '', ''], true)
  elsif q.hits[0].definition.nil?
    seq.annotate(:tcode, '')
  else
    seq.sec_desc = coding_record.call
    seq.annotate(:tmp_annotation, [seq.sec_desc, '', '', ''], true)
  end
end
273
+
274
+
275
# Re-emits the annotations that a previous database stored under
# :tmp_annotation. The stored message is read as
# [tab-separated summary row, protein, alignment, [q, hit, protein, fasta, status]].
#
# Calls print_nt_seqs with the stored objects, copies the stored protein and
# alignment to the :protein and :alignment annotations, and stores the summary
# row again with its sixth column blanked. When the stored hit is flagged as
# reversed, q_frame/q_beg/q_end in the row are replaced by the values that
# reverse_seq returns.
def save_last_db_annotations(seq)
  (q, final_hit, final_prot, query_fasta, final_status) = seq.get_annotations(:tmp_annotation).first[:message][3]
  print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # marks ATG (_-_) and STOP (___)

  fields = seq.get_annotations(:tmp_annotation).first[:message][0].split("\t")
  # Exactly 18 columns are written back; missing ones come out empty.
  columns = Array.new(18) { |i| fields[i] }
  columns[5] = '' # this column is always written empty

  if final_hit.reversed
    # columns 11, 12, 13 hold q_frame, q_beg, q_end; reverse_seq hands back
    # the end coordinate before the begin coordinate
    _, columns[11], columns[13], columns[12] = reverse_seq(query_fasta, columns[11].to_i, columns[12].to_i, columns[13].to_i)
  end

  seq.annotate(:protein, seq.get_annotations(:tmp_annotation).first[:message][1])
  seq.annotate(:alignment, seq.get_annotations(:tmp_annotation).first[:message][2])
  seq.annotate(:tmp_annotation, [columns.join("\t"), '', '', ''], true)
end
292
+
293
+
294
# Looks for the start of the protein in the 5' part of the translation.
#
# subject_start - position in the subject where the hit begins (Integer).
# substring     - translated residues to search; all length arithmetic here
#                 subtracts 10 from its length.
# fiable        - true when the hit is considered reliable by the caller.
# aas_n_end     - tolerance in amino acids; an M found more than twice this
#                 distance from the subject start is rejected.
#
# The last '*' in +substring+ (if any) is treated as the upstream stop codon:
# everything up to and including it is dropped, and the result is trimmed to
# the first 'M' that follows when there is one.
#
# Returns [trimmed substring, atg_status, msgs] where atg_status is
# 'complete', 'putative' or 'incomplete' and msgs is the accumulated warning
# text.
def find_start(subject_start, substring, fiable, aas_n_end)
  msgs = ''
  atg_status = 'incomplete' # complete, incomplete or putative

  upstream_len = substring.length - 10
  # distance to s_beg measured from the beginning of the substring
  s_beg_distance = upstream_len - subject_start
  # distance to s_beg measured from the end of the substring
  atg_distance = (subject_start + 1) - upstream_len
  if atg_distance > 0
    msgs = "atg_distance in limit (1-15): atg_distance = #{atg_distance}, "
  end

  stop_codon = substring.rindex('*')

  if stop_codon
    # there is a stop codon in the 5' substring
    stop_codon += 1
    stop_before_hit = stop_codon <= s_beg_distance

    # both cases keep only what follows the stop, trimmed to the first M if present
    substring = substring[stop_codon, substring.length - stop_codon]
    first_m = substring.index('M')
    substring = substring[first_m, substring.length - first_m] if first_m

    if stop_before_hit
      if first_m # M and stop
        atg_status = 'complete'
      else       # stop but no M
        atg_status = 'putative'
        msgs += "W1: There is no M at the beginning, "
      end
    else
      # the stop lies after s_beg: a frame shift prevents analysing the beginning
      atg_status = 'putative'
      msgs += " Unexpected STOP codon in 5 prime region, "
    end
  else
    # no stop codon
    first_m = substring.index('M')
    if first_m
      m_distance = subject_start - (upstream_len - first_m)
      substring = substring[first_m, substring.length - first_m]

      if m_distance > aas_n_end*2
        # no stop, and the M is too far from the start given by the subject
        msgs += "No stop codon before M and M found is too far from subject M, "
        atg_status = 'incomplete'
      elsif fiable
        # M without an upstream stop, but the hit is reliable
        atg_status = 'complete'
      else
        # M without an upstream stop and the hit is not reliable
        msgs += "No stop codon before M and low homology subject, "
        atg_status = 'putative'
      end
    else
      # neither M nor stop
      atg_status = 'putative'
      msgs += "W2: There is no M at the beginning, "
    end
  end

  [substring, atg_status, msgs]
end
380
+
381
+
382
# Looks for the end of the protein in the 3' part of the translation.
#
# final_hit   - hit; reads full_subject_length, s_end and q_end.
# q           - BLAST query; reads full_query_length.
# full_prot   - complete translation; only its length is used, to work out
#               how many residues were cut from the 5' end of +tmp_prot+.
# tmp_prot    - translation after the 5' trimming.
# end_status  - initial status, returned unchanged only when no branch sets it.
# warnings    - warning text accumulated so far (not mutated; a new string is
#               returned).
# aas_n_end   - tolerance in amino acids.
#
# +tmp_prot+ is split 15 residues before the end of the hit; the first '*' in
# the tail is taken as the candidate stop codon and the tail is cut right
# after it.
#
# The split index is clamped to 0..tmp_prot.length. Unclamped, an index of 0
# produced the Range 0..-1 and duplicated the whole protein in the result, and
# an index beyond the string produced a nil tail.
#
# Returns [head, tail, end_status, warnings, putative_end] where putative_end
# is the index of '*' inside the tail, or nil.
def find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end)
  # residues that remain without similarity up to the end
  s_end_resto = final_hit.full_subject_length - (final_hit.s_end.to_i + 1) # subject: aas still to be covered
  q_end_resto = (q.full_query_length.to_i - final_hit.q_end.to_i)/3         # query: aas available
  sq_end_distance = q_end_resto - s_end_resto

  cut_in_5p = full_prot.length - tmp_prot.length

  hit_end_aa = final_hit.q_end.to_i/3 - cut_in_5p
  split_at = [[hit_end_aa - 15, 0].max, tmp_prot.length].min

  resto_substring = tmp_prot[0, split_at]
  end_substring = tmp_prot[split_at, tmp_prot.length - split_at]
  putative_end = end_substring.index('*')

  if sq_end_distance + aas_n_end < 0
    # not enough sequence left to contain the stop codon
    end_status = 'incomplete'
    if putative_end
      warnings += " Unexpected STOP codon at 3' end. Distance to subject end: #{sq_end_distance.abs} aas, "
      end_substring = end_substring[0, putative_end+1]
    else
      warnings += "Distance to subject end: #{sq_end_distance.abs} aas, "
    end
  elsif putative_end
    # enough sequence and a stop codon was found
    # distance between the stop and q_end; negative when the stop lies before q_end
    # (equals putative_end - 15 whenever the split index was not clamped)
    q_stop_resto = putative_end + split_at - hit_end_aa
    qs_stop_distance = q_stop_resto - s_end_resto # distance between the query and subject stops

    if qs_stop_distance + aas_n_end >= 0
      # query stop within the tolerance of, or after, the subject end
      end_status = 'complete'
    elsif qs_stop_distance + 2*aas_n_end < 0
      # query stop more than twice the tolerance before the subject end
      end_status = 'putative'
      warnings += " query STOP codon too far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, putative chimeric sequence, "
    elsif qs_stop_distance + aas_n_end < 0
      # query stop between one and two tolerances before the subject end
      end_status = 'putative'
      warnings += " query STOP codon is far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, "
    end
    end_substring = end_substring[0, putative_end+1]
  else
    # enough sequence but no stop codon
    end_status = 'putative'
    warnings += " STOP codon was not found. Distance to subject end: #{sq_end_distance.abs} aas, "
  end

  [resto_substring, end_substring, end_status, warnings, putative_end]
end
446
+
447
+
448
# Combines the status of the protein start and the protein end into the
# final label of the sequence.
#
# atg_status / end_status - each one of 'complete', 'putative', 'incomplete'.
#
# Returns the label String, or nil for any other combination.
def determine_status(atg_status,end_status)
  labels = {
    ['complete',   'complete']   => 'Complete',            # whole protein
    ['putative',   'complete']   => 'Putative Complete',   # putative start and/or end
    ['complete',   'putative']   => 'Putative Complete',
    ['putative',   'putative']   => 'Putative Complete',
    ['incomplete', 'incomplete'] => 'Internal',            # middle region
    ['complete',   'incomplete'] => 'N-terminus',          # start of the protein
    ['putative',   'incomplete'] => 'Putative N-terminus', # possibly the start
    ['incomplete', 'complete']   => 'C-terminus',          # end of the protein
    ['incomplete', 'putative']   => 'Putative C-terminus'  # possibly the end
  }

  labels[[atg_status, end_status]]
end
468
+
469
+
470
# Writes the result of the analysis as annotations on +seq+.
#
# * 'Complete' sequences get :protein, :nucleotide (through print_nt_seqs),
#   :complete and :alignment annotations.
# * Otherwise, when +seq+ already carries :tmp_annotation entries and the
#   database name starts with "tr_", the stored annotations are re-emitted;
#   with any other database name nothing is written.
# * Otherwise (no previous annotations): a "tr_" database annotates the
#   sequence with the current hit; any other database stores the current
#   result under :tmp_annotation and in seq.sec_desc.
#
# The previous annotations are fetched once and nil is treated like an empty
# list; a nil test placed after the call to empty? could never take effect.
def print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
  name_diff = q.query_def.length - final_hit.acc.length
  spnum = name_diff > 0 ? ' ' * name_diff : '' # pads the accession so both alignment rows line up

  protein_entry = ">#{q.query_def}\n#{final_prot}"
  alignment_entry = "#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n"
  # Built on demand: print_nt_seqs and reverse_seq may change the hit coordinates first.
  summary_row = lambda do
    "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}"
  end
  # Puts the hit coordinates back on the original strand of the input sequence.
  restore_strand = lambda do
    if final_hit.reversed
      (_, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i)
    end
  end

  if final_status == 'Complete'
    seq.annotate(:protein, protein_entry)
    print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # marks ATG (_-_) and STOP (___)
    restore_strand.call
    seq.annotate(:complete, summary_row.call)
    seq.annotate(:alignment, alignment_entry)
  else
    # the protein is not complete
    previous = seq.get_annotations(:tmp_annotation)

    if previous.nil? || previous.empty?
      # no information from a previous database
      if db_name =~ /^tr_/
        # trembl is being used: annotate with the current hit
        print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # marks ATG (_-_) and STOP (___)
        restore_strand.call

        seq.annotate(:alignment, alignment_entry)
        seq.annotate(:protein, protein_entry)
        seq.annotate(:tmp_annotation, [summary_row.call, '', '', ''])
      else
        # keep the annotations so that the next database can use them
        tmp_annot = summary_row.call
        seq.sec_desc = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\tCoding Seq\t\t#{final_hit.e_val}\t#{final_hit.ident}\t\t#{final_hit.full_subject_length}\t#{warnings}\t\t\t\t\t\t#{final_hit.definition}\t"
        seq.annotate(:tmp_annotation, [tmp_annot, protein_entry, alignment_entry, [q, final_hit, final_prot, query_fasta, final_status]])
      end
    elsif db_name =~ /^tr_/
      # trembl with information from a previous database: the stored annotations are kept
      stored = previous.first[:message]
      (_, stored_hit, stored_prot, stored_fasta, stored_status) = stored[3]
      print_nt_seqs(seq, q, stored_hit, stored_prot, stored_fasta, stored_status) # marks ATG (_-_) and STOP (___)

      name, _fasta_length, acc, stored_db, row_status, _testcode, e_val, ident, my_length, subject_length,
        stored_warnings, q_frame, q_beg, q_end, s_beg, s_end, description, row_prot = stored[0].split("\t")
      if stored_hit.reversed
        (_, q_frame, q_end, q_beg) = reverse_seq(stored_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i)
      end

      seq.annotate(:protein, stored[1])
      seq.annotate(:alignment, stored[2])

      tmp_annot = "#{name}\t#{stored_fasta.length}\t#{acc}\t#{stored_db}\t#{row_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{stored_warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{row_prot}"
      seq.annotate(:tmp_annotation, [tmp_annot, '', '', ''], true)
    end
  end
end
541
+
542
+
543
# Writes the :nucleotide annotation for +seq+: the nucleotide sequence with
# '_-_' inserted before the start codon and '___' inserted after the stop
# codon when they are found where the hit coordinates say they should be.
#
# The start codon is only searched for statuses that include the N-terminus,
# in an 11-nt window around final_hit.q_beg (or the first 11 nt when q_beg is
# below 6). The stop codon is only checked for statuses that include the
# C-terminus; for the non-'Complete' ones a second attempt is made one codon
# further downstream, and final_hit.q_end is moved 3 nt when that succeeds.
#
# NOTE(review): a 'Complete' sequence whose expected stop codon does not
# translate to '*' gets no :nucleotide annotation at all — confirm that this
# is intended.
def print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status)
  #------------------------------------------------------------------------------------------------------------- ATG
  my_seq = query_fasta
  bad_atg = true # stays true unless a start codon can be marked

  if ['Complete', 'Putative Complete', 'Putative N-terminus', 'N-terminus'].include?(final_status)
    # the window starts at the edge of the sequence when q_beg is too close to it
    at_edge = final_hit.q_beg < 6
    window = at_edge ? query_fasta[0..10] : query_fasta[final_hit.q_beg - 5..final_hit.q_beg + 5]

    first_atg = window.index(/ATG/i)
    last_atg = window.rindex(/ATG/i)

    marked =
      if first_atg.nil?
        nil                                  # no ATG in the window
      elsif at_edge || first_atg == last_atg
        window.sub(/ATG/i, '_-_ATG')         # mark the first ATG found
      elsif first_atg == 5 || last_atg == 5
        window[0..4] + '_-_' + window[5..10] # several ATGs: use the one exactly at q_beg
      end                                    # several ATGs, none at q_beg: left unmarked

    if marked
      my_seq =
        if at_edge
          marked + query_fasta[11..query_fasta.length + 1]
        else
          query_fasta[0..final_hit.q_beg - 6] + marked + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1]
        end
      bad_atg = false
    end
  end

  #------------------------------------------------------------------------------------------------------------- STOP
  stop_c = nil
  stop_c_longer = nil
  if ['Complete', 'Putative Complete', 'C-terminus', 'Putative C-terminus'].include?(final_status)
    # when the start was marked, my_seq is 3 characters longer ('_-_'), hence the shifted offsets
    if bad_atg
      stop_c = my_seq[final_hit.q_end - 2..final_hit.q_end]
      stop_c_longer = my_seq[final_hit.q_end - 7..final_hit.q_end + 5]
    else
      stop_c = my_seq[final_hit.q_end + 3..final_hit.q_end + 5]
      stop_c_longer = my_seq[final_hit.q_end - 2..final_hit.q_end + 10]
    end
  end

  if stop_c.nil?
    if bad_atg
      seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP\t\t\t\t\t#{my_seq}")
    else
      seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP\t\t\t\t\t#{my_seq}")
    end
  elsif stop_c.translate == '*'
    if bad_atg
      my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1]
      seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
    else
      my_seq = my_seq[0..final_hit.q_end + 5] +'___'+ my_seq[final_hit.q_end + 6..my_seq.length + 1]

      # translate what lies between the two marks and compare it with the predicted protein
      my_prot = my_seq.sub(/\w+_\-_/,'')
      my_prot = my_prot.sub(/___\w+/,'')
      my_prot = my_prot.translate
      my_prot = my_prot.sub(/x$/,'')

      similar_fragment = final_prot.lcs(my_prot)

      if (similar_fragment.length == final_prot.length) && (similar_fragment.length == my_prot.length)
        seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\t\t\t\t\t\t#{my_seq}")
      else
        seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tthe nucleotide sequence contain a lot of errors\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
      end
    end
  elsif ['Putative Complete', 'C-terminus', 'Putative C-terminus'].include?(final_status)
    # second attempt, one codon downstream
    if bad_atg
      stop_c = my_seq[final_hit.q_end+1..final_hit.q_end+3]
      stop_c_longer = my_seq[final_hit.q_end - 4..final_hit.q_end + 8]
    else
      stop_c = my_seq[final_hit.q_end + 7..final_hit.q_end + 9]
      stop_c_longer = my_seq[final_hit.q_end..final_hit.q_end + 13]
    end

    if !stop_c.nil?
      if stop_c.translate == '*'
        final_hit.q_end = final_hit.q_end + 3
        if bad_atg
          my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1]
          seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
        else
          seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
        end
      elsif bad_atg
        seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
      else
        seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
      end
    end
  end
end
687
+
688
+ end