full_lengther_next 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,688 @@
1
+
2
+ require 'une_los_hit'
3
+
4
+ module FlAnalysis
5
+
6
# Analyses one query sequence against its BLAST result for database
# +db_name+: locates the start and the end of the protein, decides the
# sequence status and hands everything to print_annotations.
#
# seq         - sequence object; this method reads seq_name, seq_fasta and
#               sec_desc and calls get_annotations / annotate on it.
# blast_query - BLAST result for the same sequence (query_def, hits).
# options     - Hash read with the keys :distance, :ident and :evalue.
# db_name     - database name; names matching /^tr_/ get special handling
#               (per the original comments this is the trembl database).
#
# Raises RuntimeError when the BLAST query name differs from the sequence
# name. Returns nil on the early exits (no hit, ERROR#1, ERROR#2); otherwise
# returns whatever print_annotations returns.
def analiza_orf_y_fl(seq, blast_query, options, db_name)
  aas_n_end = options[:distance]
  pident_threshold = options[:ident]
  evalue_threshold = options[:evalue]

  # used to detect if the sequence and the blast are from different query
  if seq.seq_name != blast_query.query_def
    raise "BLAST query name and sequence are different"
  end

  q = blast_query
  msgs = ''
  end_status = ''

  # the fasta sequence is saved
  query_fasta = seq.seq_fasta

  # There is no match in blast, the sequence goes on to the next database.
  if q.hits[0].nil?
    # If the DB is trembl and the sequence has annotations from another DB,
    # those annotations must be printed.
    if db_name =~ /^tr_/
      if seq.get_annotations(:tmp_annotation).empty?
        if seq.sec_desc.empty?
          seq.annotate(:tcode, '')
        else
          seq.annotate(:tmp_annotation, [seq.sec_desc, '', '', ''], true)
        end
      else
        save_last_db_annotations(seq)
      end
    end

    return
  end

  warnings = ''
  wrong_seq = false

  if q.hits.count > 1
    # More than one hit: UneLosHit checks/fixes the frames to get a single hit.
    seq_unida = UneLosHit.new(q, query_fasta, pident_threshold)

    wrong_seq = seq_unida.wrong_seq
    is_ok = seq_unida.is_ok
    q_index_start = seq_unida.q_index_start
    full_prot = seq_unida.full_prot

    query_fasta = seq_unida.output_seq # repaired fasta

    final_hit = seq_unida.final_hit    # single hit
    msgs = seq_unida.msgs              # warning messages
    x_number = seq_unida.number_x      # number of nucleotides used to fix frame errors
  else
    # Only one hit. If the sequence is reversed (negative frame), flip it.
    if q.hits[0].q_frame.to_i < 0
      (query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end) = reverse_seq(query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end)
      q.hits[0].reversed = true
    end

    final_hit = q.hits[0] # single hit
    x_number = 0          # number of nucleotides used to fix frame errors

    full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
    (is_ok, q_index_start) = contenidos_en_prot(final_hit, full_prot, q)
  end

  # ERROR#1: logged through error_log, analysis stops here.
  if wrong_seq
    warnings = "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence, " + warnings
    error_log(q, seq, warnings, db_name)
    return
  end

  warnings += msgs
  msgs = ''

  # ERROR#2: logged through error_log, analysis stops here.
  if x_number < 0
    warnings = "ERROR#2, unexpected negative index in x_number, " + warnings
    error_log(q, seq, warnings, db_name)
    return
  end

  # ERROR#3 is only recorded as a warning; the analysis continues.
  if !is_ok
    warnings = "ERROR#3, very serious frame error, " + warnings
  end

  # The hit is trusted when it passes both the identity and e-value thresholds.
  fiable = (final_hit.ident >= pident_threshold) && (final_hit.e_val <= evalue_threshold)

  # if the query protein is large enough at the start of the sequence should have the start codon
  if final_hit.q_beg/3 + aas_n_end >= final_hit.s_beg.to_i
    substring = full_prot[0, q_index_start + 10]
    resto_substring = full_prot[q_index_start + 10, full_prot.length - q_index_start - 10]

    # to look for the beginning of the protein
    (m_substring, atg_status, msgs) = find_start(final_hit.s_beg, substring, fiable, aas_n_end)

    # pasting the substring sequence with the rest of the sequence
    tmp_prot = "#{m_substring}#{resto_substring}"
    # to get the value of the start_ORF index
    final_hit.q_beg = final_hit.q_beg.to_i - ((m_substring.length - 10) * 3)
  else
    # beginning too short
    atg_status = 'incomplete'
    substring = full_prot[0, q_index_start]

    if substring.rindex('*')
      warnings += "Unexpected stop codon in the beginning of your sequence, "
    end

    final_hit.q_beg = final_hit.q_beg.to_i - (substring.length * 3)
    tmp_prot = full_prot
  end

  # look for the end of the protein
  (resto_substring, end_substring, end_status, warnings, putative_end) = find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end)
  final_prot = "#{resto_substring}#{end_substring}"

  warnings += msgs

  # to get the value of the end_ORF index
  if atg_status == 'complete'
    final_hit.q_end = final_hit.q_beg - 3 + (final_prot.length * 3)
  else
    if putative_end
      # 45 nt = the 15 aas that find_end keeps before q_end when it splits
      # the protein (see the "- 15" offset there).
      final_hit.q_end = final_hit.q_end - 45 + (putative_end*3)
    end
  end

  # decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
  final_status = determine_status(atg_status, end_status)

  if final_prot.length - 2*aas_n_end > final_hit.full_subject_length
    warnings += " your sequence is longer than subject: #{final_prot.length} - #{final_hit.full_subject_length}"
  elsif final_prot.length + aas_n_end < final_hit.full_subject_length
    warnings += " your sequence is shorter than subject: #{final_prot.length} - #{final_hit.full_subject_length}"
    if (final_prot.length + 100 < final_hit.full_subject_length) || (final_prot.length*2 < final_hit.full_subject_length)
      # 100 aas shorter than the subject, or shorter than half of it:
      # a "Complete" prediction is downgraded.
      if final_status == 'Complete'
        final_status = 'Putative Complete'
        warnings += ". Was predicted as Complete, but is very much shorter than de subject"
      end
    end
  end

  print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
end
186
+
187
+
188
# Debug helper: prints the query header and then every field of each hit
# in q.hits to standard output, one "label: value" line per field.
# q_beg, q_end, s_beg and s_end are printed with 1 added.
def test_blast_hits(q)
  shifted_by_one = [:q_beg, :q_end, :s_beg, :s_end]

  # [label, reader] pairs in print order; "subject_id" is read from acc.
  columns = [
    ['subject_id', :acc],
    ['acc', :acc],
    ['full_subject_length', :full_subject_length],
    ['q_beg', :q_beg],
    ['q_end', :q_end],
    ['q_frame', :q_frame],
    ['s_beg', :s_beg],
    ['s_end', :s_end],
    ['s_frame', :s_frame],
    ['align_len', :align_len],
    ['gaps', :gaps],
    ['mismatches', :mismatches],
    ['reversed', :reversed],
    ['score', :score],
    ['bit_score', :bit_score],
    ['ident', :ident],
    ['e_val', :e_val],
    ['definition', :definition],
    ['q_seq', :q_seq],
    ['s_seq', :s_seq]
  ]

  puts "query_def: #{q.query_def} full_query_length: #{q.full_query_length} ------------------------------------------------"

  q.hits.each do |hit|
    columns.each do |label, reader|
      value = hit.public_send(reader)
      value += 1 if shifted_by_one.include?(reader)
      puts "\t #{label}: #{value}"
    end
  end
end
217
+
218
+
219
# Debug helper: prints the fields of the final hit to standard output,
# followed by the nucleotides of query_fasta between q_beg and q_end
# (inclusive) and the result of calling translate on that slice.
# Coordinates are printed with 1 added.
def test_final_hit(final_hit, query_fasta)
  hit = final_hit

  puts "\t acc: #{hit.acc}"
  puts "\t full_subject_length: #{hit.full_subject_length}"

  puts "\n\t q_frame: #{hit.q_frame}"
  puts "\t reversed: #{hit.reversed}"

  puts "\n\t q_beg-q_end: #{hit.q_beg + 1} - #{hit.q_end + 1}"
  puts "\t s_beg - s_end: #{hit.s_beg + 1} - #{hit.s_end + 1}"

  puts "\n\t score: #{hit.score}, bit_score: #{hit.bit_score}, ident: #{hit.ident}, e_val: #{hit.e_val}"

  puts "\n\t definition: #{hit.definition}"
  puts "\t q_seq: #{hit.q_seq}"
  puts "\t s_seq: #{hit.s_seq}"

  # Region of the query covered by the hit, printed as nucleotides and as protein.
  covered = query_fasta[hit.q_beg..hit.q_end]
  puts "\nnt q_beg-q_end\n#{covered}"
  puts "\n\nprot q_beg-q_end\n#{query_fasta[hit.q_beg..hit.q_end].translate}"
end
240
+
241
+
242
# Records a sequence whose analysis was aborted with an error.
#
# The warnings text is prefixed with "Coding sequence with some errors, "
# and stored in a tab-separated "Coding Seq" row built from the first hit.
#
# * db_name not matching /^tr_/: only seq.sec_desc is filled, and only when
#   it is still empty and the first hit has a definition.
# * db_name matching /^tr_/:
#   - a :tmp_annotation already exists -> save_last_db_annotations(seq)
#   - sec_desc already set             -> its 'my_warning' placeholder is
#                                         replaced and the row is annotated
#   - first hit has no definition      -> an empty :tcode annotation is added
#   - otherwise                        -> sec_desc is filled and annotated
def error_log(q, seq, warnings, db_name)
  flagged = "Coding sequence with some errors, #{warnings}"

  coding_seq_row = lambda do
    top = q.hits[0]
    "#{q.query_def}\t#{seq.seq_fasta.length}\t#{top.acc}\t#{db_name}\tCoding Seq\t\t#{top.e_val}\t#{top.ident}\t\t#{top.full_subject_length}\t#{flagged}\t\t\t\t\t\t#{top.definition}\t"
  end

  if !(db_name =~ /^tr_/)
    seq.sec_desc = coding_seq_row.call if seq.sec_desc.empty? && !q.hits[0].definition.nil?
  elsif !seq.get_annotations(:tmp_annotation).empty?
    save_last_db_annotations(seq)
  elsif !seq.sec_desc.empty?
    seq.annotate(:tmp_annotation, [seq.sec_desc.sub('my_warning', flagged), '', '', ''], true)
  elsif q.hits[0].definition.nil?
    seq.annotate(:tcode, '')
  else
    seq.sec_desc = coding_seq_row.call
    seq.annotate(:tmp_annotation, [seq.sec_desc, '', '', ''], true)
  end
end
273
+
274
+
275
# Re-emits the annotations kept in the first :tmp_annotation of seq.
#
# The stored message is read as a 4-element array:
#   [0] tab-separated annotation row, [1] protein text, [2] alignment text,
#   [3] [q, final_hit, final_prot, query_fasta, final_status]
# The nucleotide annotation is regenerated through print_nt_seqs, the
# protein and alignment texts are annotated again, and :tmp_annotation is
# re-annotated with the row (its 6th column left blank).
def save_last_db_annotations(seq)
  stored = seq.get_annotations(:tmp_annotation).first[:message]

  (q, final_hit, final_prot, query_fasta, final_status) = stored[3]
  print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # marks ATG (_-_) and STOP (___)

  (name, fasta_length, acc, db_name, row_status, _testcode, e_val, ident, my_length, subject_length,
   warnings, q_frame, q_beg, q_end, s_beg, s_end, description, row_prot) = stored[0].split("\t")

  # For hits flagged as reversed the frame and coordinates go through
  # reverse_seq again; note that begin and end are swapped on assignment.
  if final_hit.reversed
    (_discarded, q_frame, q_end, q_beg) = reverse_seq(query_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i)
  end

  seq.annotate(:protein, stored[1])
  seq.annotate(:alignment, stored[2])

  columns = [name, fasta_length, acc, db_name, row_status, '', e_val, ident, my_length, subject_length,
             warnings, q_frame, q_beg, q_end, s_beg, s_end, description, row_prot]
  seq.annotate(:tmp_annotation, [columns.map { |column| column.to_s }.join("\t"), '', '', ''], true)
end
292
+
293
+
294
# Looks for the start of the protein in the 5' piece of the translation.
#
# subject_start - start of the alignment in the subject (an Integer here:
#                 it is used in arithmetic without conversion).
# substring     - translated region; its last 10 residues are treated as
#                 already belonging to the hit (every distance uses
#                 length - 10).
# fiable        - truthy when the hit is considered reliable.
# aas_n_end     - distance tolerance in amino acids.
#
# Returns [trimmed_substring, atg_status, msgs] where atg_status is
# 'complete', 'putative' or 'incomplete'.
def find_start(subject_start, substring, fiable, aas_n_end)
  notes = []
  upstream_len = substring.length - 10

  # distance to s_beg measured from the beginning of the substring
  s_beg_distance = upstream_len - subject_start
  # distance to s_beg measured from the end of the substring
  atg_distance = (subject_start + 1) - upstream_len
  notes << "atg_distance in limit (1-15): atg_distance = #{atg_distance}, " if atg_distance > 0

  last_stop = substring.rindex('*')

  if last_stop
    # There is a stop codon in the 5' substring: keep only what follows it,
    # and start at the first M after it when there is one.
    after_stop = last_stop + 1
    stop_before_s_beg = after_stop <= s_beg_distance

    candidate = substring[after_stop..-1]
    m_index = candidate.index('M')
    candidate = candidate[m_index..-1] if m_index

    if stop_before_s_beg && m_index
      # stop followed by an M
      return [candidate, 'complete', notes.join]
    end

    notes << if stop_before_s_beg
               "W1: There is no M at the beginning, "
             else
               # the stop lies after s_beg: a frame change prevents analysing the start
               " Unexpected STOP codon in 5 prime region, "
             end
    return [candidate, 'putative', notes.join]
  end

  # No stop codon from here on.
  m_index = substring.index('M')
  unless m_index
    # neither M nor STOP
    notes << "W2: There is no M at the beginning, "
    return [substring, 'putative', notes.join]
  end

  m_distance = subject_start - (upstream_len - m_index)
  candidate = substring[m_index..-1]

  if m_distance > aas_n_end * 2
    # M found, but too far from the start indicated by the subject
    notes << "No stop codon before M and M found is too far from subject M, "
    status = 'incomplete'
  elsif fiable
    # M without a stop codon, but the subject is reliable
    status = 'complete'
  else
    # M without a stop codon and an unreliable subject
    notes << "No stop codon before M and low homology subject, "
    status = 'putative'
  end

  [candidate, status, notes.join]
end
380
+
381
+
382
# Looks for the end of the protein in the 3' part of the translation.
#
# tmp_prot is split 15 residues before the position that corresponds to
# final_hit.q_end (after discounting what was cut from the 5' end of
# full_prot); the first stop codon is searched for in the second piece.
#
# Returns [resto_substring, end_substring, end_status, warnings,
# putative_end]: end_status is 'complete', 'putative' or 'incomplete',
# warnings is the incoming text plus any message added here, and
# putative_end is the index of the stop inside the 3' piece (nil when no
# stop was found). end_substring is cut right after the stop when there is one.
def find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end)
  # subject: number of aas still to be covered after the alignment
  subject_left = final_hit.full_subject_length - (final_hit.s_end.to_i + 1)
  # query: number of aas available after the alignment
  query_left = (q.full_query_length.to_i - final_hit.q_end.to_i) / 3
  sq_end_distance = query_left - subject_left

  removed_from_5p = full_prot.length - tmp_prot.length
  split_at = final_hit.q_end / 3 - removed_from_5p - 15

  resto_substring = tmp_prot[0..split_at - 1]
  end_substring = tmp_prot[split_at..tmp_prot.length]
  putative_end = end_substring.index('*')

  if sq_end_distance + aas_n_end < 0
    # not enough sequence left to reach the stop
    end_status = 'incomplete'
    if putative_end
      warnings = warnings + " Unexpected STOP codon at 3' end. Distance to subject end: #{sq_end_distance.abs} aas, "
      end_substring = end_substring[0..putative_end]
    else
      warnings = warnings + "Distance to subject end: #{sq_end_distance.abs} aas, "
    end
  elsif putative_end.nil?
    # enough sequence, but no stop codon
    end_status = 'putative'
    warnings = warnings + " STOP codon was not found. Distance to subject end: #{sq_end_distance.abs} aas, "
  else
    # enough sequence and a stop codon
    stop_to_q_end = putative_end - 15 # negative when the stop lies before q_end
    qs_stop_distance = stop_to_q_end - subject_left # distance between the query stop and the subject stop

    if qs_stop_distance + aas_n_end >= 0
      end_status = 'complete'
    elsif qs_stop_distance + 2 * aas_n_end < 0
      end_status = 'putative'
      warnings = warnings + " query STOP codon too far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, putative chimeric sequence, "
    elsif qs_stop_distance + aas_n_end < 0
      end_status = 'putative'
      warnings = warnings + " query STOP codon is far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, "
    end
    end_substring = end_substring[0..putative_end]
  end

  [resto_substring, end_substring, end_status, warnings, putative_end]
end
446
+
447
+
448
# Combines the status of the start (atg_status) and of the end (end_status)
# of the protein into the final sequence status.
#
# Both arguments are expected to be 'complete', 'putative' or 'incomplete';
# any other combination yields nil.
def determine_status(atg_status, end_status)
  # [start, end] => final status
  status_table = {
    ['complete',   'complete']   => 'Complete',            # whole protein
    ['putative',   'complete']   => 'Putative Complete',   # putative start and/or end
    ['complete',   'putative']   => 'Putative Complete',
    ['putative',   'putative']   => 'Putative Complete',
    ['incomplete', 'incomplete'] => 'Internal',            # internal region
    ['complete',   'incomplete'] => 'N-terminus',          # start of the protein
    ['putative',   'incomplete'] => 'Putative N-terminus', # maybe the start of the protein
    ['incomplete', 'complete']   => 'C-terminus',          # end of the protein
    ['incomplete', 'putative']   => 'Putative C-terminus'  # maybe the end of the protein
  }

  status_table[[atg_status, end_status]]
end
468
+
469
+
470
# Stores the result of the analysis as annotations on seq.
#
# * 'Complete' sequences are annotated right away (:protein, :nucleotide
#   through print_nt_seqs, :complete and :alignment).
# * Otherwise the behaviour depends on whether seq already carries a
#   :tmp_annotation and on whether db_name matches /^tr_/:
#   - no previous annotation, tr_ database: annotated with this hit
#   - no previous annotation, other database: the result is kept in
#     :tmp_annotation (and sec_desc) so a later call can use it
#   - previous annotation, tr_ database: the previous annotation is
#     re-emitted instead of this result
#   - previous annotation, other database: nothing is done
def print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
  name_diff = q.query_def.length - final_hit.acc.length
  # padding used after the accession in the alignment text
  spnum = name_diff > 0 ? ' ' * name_diff.to_i : ''

  protein_text = lambda { ">#{q.query_def}\n#{final_prot}" }
  alignment_text = lambda do
    "#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n"
  end
  # Built on demand: print_nt_seqs and reverse_seq may change the hit coordinates first.
  hit_row = lambda do
    "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}"
  end
  # Hits flagged as reversed go through reverse_seq again (begin/end swapped on assignment).
  remap_reversed_hit = lambda do
    if final_hit.reversed
      (_discarded, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i)
    end
  end

  if final_status == 'Complete'
    # complete sequences are printed straight away
    seq.annotate(:protein, protein_text.call)
    print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # marks ATG (_-_) and STOP (___)
    remap_reversed_hit.call
    seq.annotate(:complete, hit_row.call)
    seq.annotate(:alignment, alignment_text.call)
  elsif seq.get_annotations(:tmp_annotation).empty?
    # the protein is not complete and nothing comes from a previous database
    if db_name =~ /^tr_/
      # trembl: annotate with this hit
      print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # marks ATG (_-_) and STOP (___)
      remap_reversed_hit.call

      seq.annotate(:alignment, alignment_text.call)
      seq.annotate(:protein, protein_text.call)
      seq.annotate(:tmp_annotation, [hit_row.call, '', '', ''])
    else
      # keep the annotations for the next database
      tmp_prot = protein_text.call
      tmp_align = alignment_text.call
      tmp_annot = hit_row.call
      seq.sec_desc = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\tCoding Seq\t\t#{final_hit.e_val}\t#{final_hit.ident}\t\t#{final_hit.full_subject_length}\t#{warnings}\t\t\t\t\t\t#{final_hit.definition}\t"
      seq.annotate(:tmp_annotation, [tmp_annot, tmp_prot, tmp_align, [q, final_hit, final_prot, query_fasta, final_status]])
    end
  elsif db_name =~ /^tr_/
    # not complete, trembl, and a previous database left annotations: keep those
    stored = seq.get_annotations(:tmp_annotation).first[:message]

    (_stored_q, prev_hit, prev_prot, prev_fasta, prev_status) = stored[3]
    print_nt_seqs(seq, q, prev_hit, prev_prot, prev_fasta, prev_status) # marks ATG (_-_) and STOP (___)

    (name, _fasta_length, acc, prev_db, row_status, _testcode, e_val, ident, my_length, subject_length,
     prev_warnings, q_frame, q_beg, q_end, s_beg, s_end, description, row_prot) = stored[0].split("\t")
    if prev_hit.reversed
      (_discarded, q_frame, q_end, q_beg) = reverse_seq(prev_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i)
    end

    seq.annotate(:protein, stored[1])
    seq.annotate(:alignment, stored[2])

    tmp_annot = "#{name}\t#{prev_fasta.length}\t#{acc}\t#{prev_db}\t#{row_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{prev_warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{row_prot}"
    seq.annotate(:tmp_annotation, [tmp_annot, '', '', ''], true)
  end
end
541
+
542
+
543
# Adds a :nucleotide annotation to seq: the nucleotide sequence with the
# start codon marked by a preceding '_-_' and the stop codon marked by a
# following '___', together with a short diagnostic text.
#
# seq          - object receiving the annotation (annotate is called on it).
# q            - BLAST query (only query_def is read).
# final_hit    - hit with the ORF coordinates (q_beg / q_end); q_end is
#                moved 3 nt forward when the stop is found one codon later.
# final_prot   - predicted protein, compared with the translation of the
#                marked ORF.
# query_fasta  - nucleotide sequence.
# final_status - status string as produced by determine_status.
#
# Relies on String#translate and String#lcs, which are not core Ruby.
# NOTE(review): they are presumably provided by the project or one of its
# dependencies - confirm.
# NOTE(review): when final_status is 'Complete' and the codon at the
# expected position does not translate to '*', no :nucleotide annotation is
# written at all - confirm this is intended.
def print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status)
  bad_atg = false
  atg_found = nil
  atg_found_rv = nil
  beg5 = false
  my_seq_n = nil

  #------------------------------------------------------------------------------------------------------------- ATG
  # The start codon is only searched for when the status says the 5' end is present.
  if (final_status == 'Complete') || (final_status == 'Putative Complete') || (final_status == 'Putative N-terminus') || (final_status == 'N-terminus')
    # 11 nt window centred on q_beg
    my_seq_n = query_fasta[final_hit.q_beg - 5..final_hit.q_beg + 5]

    # q_beg too close to the 5' border: use the first 11 nt instead
    if final_hit.q_beg < 6
      my_seq_n = query_fasta[0..10]
      beg5 = true
    end

    atg_found = my_seq_n.index(/ATG/i)
    atg_found_rv = my_seq_n.rindex(/ATG/i)
  end

  if !atg_found.nil?
    if beg5
      my_seq_n.sub!(/ATG/i, '_-_ATG')
      my_seq = my_seq_n + query_fasta[11..query_fasta.length + 1]
    elsif atg_found == atg_found_rv
      # a single ATG in the window
      my_seq_n.sub!(/ATG/i, '_-_ATG')
      my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1]
    elsif (atg_found == 5) || (atg_found_rv == 5)
      # several ATGs, one of them exactly at q_beg: mark that one
      my_seq_n = my_seq_n[0..4] + '_-_' + my_seq_n[5..10]
      my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1]
    else
      # more than one ATG and none at q_beg
      bad_atg = true
      my_seq = query_fasta
    end
  else
    # no ATG (or the start was not searched for)
    bad_atg = true
    my_seq = query_fasta
  end

  #------------------------------------------------------------------------------------------------------------- STOP
  stop_c = nil
  stop_c_longer = nil
  if (final_status == 'Complete') || (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus')
    if bad_atg
      stop_c = my_seq[final_hit.q_end - 2..final_hit.q_end]
      stop_c_longer = my_seq[final_hit.q_end - 7..final_hit.q_end + 5]
    else
      # offsets are 3 nt larger because '_-_' was inserted upstream
      stop_c = my_seq[final_hit.q_end + 3..final_hit.q_end + 5]
      stop_c_longer = my_seq[final_hit.q_end - 2..final_hit.q_end + 10]
    end
  end

  if !stop_c.nil?
    if stop_c.translate == '*'
      if bad_atg
        my_seq = my_seq[0..final_hit.q_end] + '___' + my_seq[final_hit.q_end + 1..my_seq.length + 1]
        seq.annotate(:nucleotide, "#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
      else
        my_seq = my_seq[0..final_hit.q_end + 5] + '___' + my_seq[final_hit.q_end + 6..my_seq.length + 1]

        # Translate what lies between the two marks and compare it with final_prot.
        my_prot = my_seq.sub(/\w+_\-_/, '')
        my_prot = my_prot.sub(/___\w+/, '')
        my_prot = my_prot.translate
        my_prot = my_prot.sub(/x$/, '')

        similar_fragment = final_prot.lcs(my_prot)

        if (similar_fragment.length == final_prot.length) && (similar_fragment.length == my_prot.length)
          seq.annotate(:nucleotide, "#{q.query_def}\t#{final_status}\t\t\t\t\t\t#{my_seq}")
        else
          seq.annotate(:nucleotide, "#{q.query_def}\t#{final_status}\tthe nucleotide sequence contain a lot of errors\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
        end
      end
    else
      if (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus')
        # Second attempt: look one codon further downstream.
        if bad_atg
          stop_c = my_seq[final_hit.q_end + 1..final_hit.q_end + 3]
          stop_c_longer = my_seq[final_hit.q_end - 4..final_hit.q_end + 8]
        else
          stop_c = my_seq[final_hit.q_end + 7..final_hit.q_end + 9]
          stop_c_longer = my_seq[final_hit.q_end..final_hit.q_end + 13]
        end

        if !stop_c.nil?
          if stop_c.translate == '*'
            final_hit.q_end = final_hit.q_end + 3
            if bad_atg
              my_seq = my_seq[0..final_hit.q_end] + '___' + my_seq[final_hit.q_end + 1..my_seq.length + 1]
              seq.annotate(:nucleotide, "#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
            else
              seq.annotate(:nucleotide, "#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
            end
          else
            if bad_atg
              seq.annotate(:nucleotide, "#{q.query_def}\t#{final_status}\tNO ATG NO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
            else
              seq.annotate(:nucleotide, "#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
            end
          end
        end
      end
    end
  else
    # The stop was not searched for, or the expected position is outside the sequence.
    if bad_atg
      seq.annotate(:nucleotide, "#{q.query_def}\t#{final_status}\tNO ATG NO STOP\t\t\t\t\t#{my_seq}")
    else
      seq.annotate(:nucleotide, "#{q.query_def}\t#{final_status}\tNO STOP\t\t\t\t\t#{my_seq}")
    end
  end
end
687
+
688
+ end