full_lengther_next 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,32 @@
1
+
2
# One open reading frame (ORF) candidate located on a nucleotide sequence.
#
# Attributes:
#   seq                 - nucleotide string of the region
#   t_start / t_end     - start / end coordinates as supplied by the caller
#   frame               - reading frame number (negative values are used by
#                         callers for the complementary strand)
#   stop_codon          - flag supplied by the caller (whether a stop codon
#                         was seen before the region)
#   type                - :N_terminus, :C_terminus, :Complete, :Internal,
#                         :Putative_Complete or :Putative_N_terminus
#   status              - :unknown, :putative_coding or :coding
#   score               - numeric/string score assigned later by the caller
#   more_than_one_frame - true when the region was built from several frames
class Orf
  attr_accessor :seq, :t_start, :t_end, :frame, :stop_codon, :type,
                :status, :score, :more_than_one_frame

  # Stores the given values; status, score and more_than_one_frame start at
  # :unknown, 0 and false respectively.
  def initialize(orf_seq, t_start, t_end, frame, stop_codon, type)
    @seq = orf_seq
    @t_start = t_start
    @t_end = t_end
    @frame = frame
    @stop_codon = stop_codon
    @type = type
    @status = :unknown
    @score = 0
    @more_than_one_frame = false
  end

  # True when this ORF ends strictly inside +other_orf+, i.e. its end lies
  # after the other's start and before the other's end. The check is
  # deliberately one-directional: an ORF that fully contains +other_orf+, or
  # that ends exactly on one of its boundaries, does not count.
  def overlaps?(other_orf)
    own_end = t_end
    own_end > other_orf.t_start && own_end < other_orf.t_end
  end
end
@@ -0,0 +1,111 @@
1
+
2
+ require 'orf'
3
+
4
# A named nucleotide sequence together with the ORFs and annotations that
# have been attached to it.
class Sequence
  # Upper-case ambiguity codes and the concrete bases each may be replaced by.
  DEGENERATE_BASES = {
    'R' => %w[a g],
    'W' => %w[a t],
    'M' => %w[a c],
    'K' => %w[g t],
    'S' => %w[g c],
    'Y' => %w[c t],
    'H' => %w[a t c],
    'B' => %w[g t c],
    'D' => %w[g a t],
    'V' => %w[g a c],
    'N' => %w[g a c t]
  }.freeze

  attr_accessor :seq_name, :seq_fasta, :seq_qual, :orfs, :sec_desc

  # seq_name  - identifier of the sequence
  # seq_fasta - nucleotide string; ambiguity codes are resolved immediately
  # seq_qual  - optional quality data (previously accepted but thrown away;
  #             it is now stored as given)
  def initialize(seq_name, seq_fasta, seq_qual = '')
    @seq_name = seq_name
    @seq_fasta = seq_fasta
    change_degenerated_nt!
    @seq_qual = seq_qual
    @sec_desc = ''
    @annotations = []
    @orfs = []

    @rejected = false
    @rejected_message = ''
  end

  # Builds an Orf from the arguments and appends it to #orfs.
  # Returns the updated ORF list.
  def add_orf(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type)
    @orfs.push(Orf.new(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type))
  end

  # Whether reject! has been called on this sequence.
  def rejected?
    @rejected
  end

  # Flags the sequence as rejected and remembers why.
  def reject!(message = '')
    @rejected = true
    @rejected_message = message
  end

  # Returns every stored annotation hash of the given type. Types noted in
  # the original source: :complete, :tmp_annotation, :error, :protein,
  # :nucleotide, :alignment, :tcode.
  def get_annotations(annotation_type)
    @annotations.select { |entry| entry[:annotation_type] == annotation_type }
  end

  # Appends an annotation of +annotation_type+ carrying +message+. When
  # +replace_existing+ is true, all earlier annotations of that type are
  # removed first. Returns the annotation list.
  def annotate(annotation_type, message = '', replace_existing = false)
    @annotations.delete_if { |entry| entry[:annotation_type] == annotation_type } if replace_existing
    @annotations.push({ :annotation_type => annotation_type, :message => message })
  end

  # Replaces every upper-case ambiguity code in #seq_fasta by one concrete
  # lower-case base (see fix_degenerated_fasta!). A fresh counter table is
  # built on each call, so the replacement is deterministic per call.
  def change_degenerated_nt!
    table = DEGENERATE_BASES.each_with_object({}) do |(code, bases), acc|
      acc[code] = [bases, 0]
    end
    fix_degenerated_fasta!(table)
  end

  # Rewrites #seq_fasta, substituting each character matched by
  # /[RWMKSYHBDVN]/ using +tranlaste_hash+, which maps a code to
  # [candidate_bases, counter]. The counter is incremented on every hit and
  # the candidate at (counter % candidates) is chosen, so repeated codes
  # cycle through their candidates. Lower-case codes are left untouched.
  # Returns the new sequence string.
  def fix_degenerated_fasta!(tranlaste_hash)
    resolved = @seq_fasta.each_char.map do |nt|
      next nt unless nt =~ /[RWMKSYHBDVN]/

      entry = tranlaste_hash[nt]
      entry[1] += 1
      candidates = entry[0]
      candidates[entry[1] % candidates.length]
    end

    @seq_fasta = resolved.join
  end
end
@@ -0,0 +1,877 @@
1
+
2
+ require 'common_functions'
3
+ require 'scbi_plot'
4
+
5
+ include CommonFunctions
6
+
7
+ class TestCode
8
+
9
+ def initialize(seq)
10
+
11
+ name=''
12
+ t_code=''
13
+ status=''
14
+ ref_start=0
15
+ ref_end=seq.seq_fasta.length
16
+ ref_frame=''
17
+ orf=''
18
+ protein = ''
19
+ p_long = 0
20
+
21
+ if (seq.seq_fasta.length < 200)
22
+ ref_name = seq.seq_name
23
+ ref_code = 0.0
24
+ ref_frame = 0
25
+ ref_status = 'unknown'
26
+ ref_orf = ''
27
+ ref_msgs = 'Sequence length < 200 nt'
28
+
29
+ seq.annotate(:tcode,"#{ref_name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{ref_status}\t#{ref_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
30
+
31
+ else
32
+
33
+ # para probar tescode con toda la secuencia, en lugar de con los ORFs ----------------------------------------------------------------------
34
+ # sense_strand = seq.seq_fasta.upcase
35
+ # antisense_strand = sense_strand.complementary_dna
36
+ # (t_code,t_status) = testCode_exec(sense_strand)
37
+ # ref_frame = 1
38
+ # ref_msgs = ''
39
+ # (as_t_code,as_t_status) = testCode_exec(antisense_strand)
40
+ # # puts "#{seq.seq_name}: t_code: #{t_code}, t_status: #{t_status}, as_t_code: #{as_t_code}, as_t_status: #{as_t_status}"
41
+ # seq.annotate(:tcode,"#{seq.seq_name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{t_status}\t#{t_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
42
+ # --------------------------------------------------------------------------------------------------------------------------------
43
+
44
+
45
+ # see add_region filter
46
+ (name,t_code,status,ref_start,ref_end,ref_frame,orf,ref_msgs,stop_before_start,more_than_one_frame) = t_code(seq)
47
+ seq.annotate(:tcode,"#{name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{status}\t#{t_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
48
+
49
+ # if (ref_msgs.nil?)
50
+ # ref_msgs = ''
51
+ # end
52
+ #
53
+ # if (stop_before_start)
54
+ # ref_msgs += "There is a STOP codon before ATG. "
55
+ # end
56
+ #
57
+ # if (more_than_one_frame)
58
+ # ref_msgs += "Possible frame error by an ins/del"
59
+ # end
60
+ #
61
+ # if (status.to_s =~ /^[CN]\_terminus/) || (status == :Internal)
62
+ # tmp_status = "Putative #{status.to_s}"
63
+ # else
64
+ # tmp_status = status.to_s
65
+ # end
66
+ #
67
+ #
68
+ # if (!orf.nil?) && (!more_than_one_frame)
69
+ # protein = orf.translate
70
+ # p_long = protein.length - 3
71
+ # seq.annotate(:tcode,"#{name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{tmp_status}\t#{t_code}\t\t\t#{p_long}\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t#{protein}",true)
72
+ # else
73
+ # seq.annotate(:tcode,"#{name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{tmp_status}\t#{t_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
74
+ # end
75
+
76
+ end
77
+ end
78
+
79
+ def t_code(seq)
80
+
81
+ name = seq.seq_name
82
+ tc_fasta = seq.seq_fasta
83
+
84
+ # generamos todos los ORFs de cada secuencia
85
+ uncomplete_orf_finder(seq)
86
+ minus_strand = []
87
+ plus_strand = []
88
+
89
+ # puts "**************************************************"
90
+ # puts "#{name} #{tc_fasta.length}"
91
+ # ordenamos los ORFs empezando desde 5' a 3' y luego separamos los de cada hebra en un array distinto
92
+ seq.orfs.sort{|x,y| x.t_start <=> y.t_start }.each do |one_orf|
93
+
94
+ (t_code,t_status)=testCode_exec(one_orf.seq)
95
+ one_orf.status = t_status
96
+ one_orf.score = t_code
97
+
98
+ if (one_orf.frame < 0)
99
+ minus_strand.push one_orf
100
+ elsif (one_orf.frame > 0)
101
+ plus_strand.push one_orf
102
+ end
103
+
104
+ end
105
+ #----------------------------------- Plus strand
106
+ if (!plus_strand.empty?)
107
+ # puts "--------------Plus strand:"
108
+ best_plus_region = compare_regions(plus_strand,tc_fasta)
109
+ if (!best_plus_region.nil?)
110
+ # puts "#{best_plus_region.seq.length}, #{best_plus_region.status}, #{best_plus_region.type}, #{best_plus_region.t_start} - #{best_plus_region.t_end}, #{best_plus_region.frame}"
111
+ # puts best_plus_region.seq
112
+ end
113
+ end
114
+ #----------------------------------- Minus strand
115
+ if (!minus_strand.empty?)
116
+ # puts "--------------Minus strand:"
117
+ best_minus_region = compare_regions(minus_strand,tc_fasta)
118
+ if (!best_minus_region.nil?)
119
+ # puts "#{best_minus_region.seq.length}, #{best_minus_region.status}, #{best_minus_region.type}, #{best_minus_region.t_start} - #{best_minus_region.t_end}, #{best_minus_region.frame}"
120
+ # puts best_minus_region.seq
121
+ end
122
+ end
123
+ # obtenemos la region codificante mas larga de ambas hebras
124
+ best_region = nil
125
+ if (!best_plus_region.nil?)
126
+ if (!best_minus_region.nil?)
127
+ if (best_minus_region.seq.length > best_plus_region.seq.length)
128
+ best_region = best_minus_region
129
+ else
130
+ best_region = best_plus_region
131
+ end
132
+ else
133
+ best_region = best_plus_region
134
+ end
135
+ elsif (!best_minus_region.nil?)
136
+ best_region = best_minus_region
137
+ end
138
+ # puts "--------------Best region:"
139
+ # puts " --------------------------------- #{best_region.seq.length}, #{best_region.status}, #{best_region.type}, #{best_region.t_start} - #{best_region.t_end}, #{best_region.frame}"
140
+ # comprobamos el tipo de ORF segun si tiene un codon de parada antes del atg
141
+
142
+ if (!best_region.nil?) && (best_region.seq.length >= 200)
143
+ if (best_region.type == :Complete) && (!best_region.stop_codon)
144
+ best_region.type = 'Putative Complete'
145
+ ref_msgs = 'NO STOP codon before ATG. '
146
+ elsif (best_region.type == :N_terminus) && (!best_region.stop_codon)
147
+ ref_msgs = 'NO STOP codon before ATG. '
148
+ end
149
+ return [name, best_region.score, best_region.type, best_region.t_start, best_region.t_end, best_region.frame, best_region.seq, ref_msgs, best_region.stop_codon, best_region.more_than_one_frame]
150
+ else
151
+ ref_score = 0.0
152
+ ref_start = 0
153
+ ref_end = 0
154
+ ref_frame = 0
155
+ ref_orf = ''
156
+ ref_type = 'unknown'
157
+ ref_msgs = 'Non coding ORF found >= 200 nt '
158
+ return [name, ref_score, ref_type, ref_start, ref_end, ref_frame, ref_orf, ref_msgs, false, false]
159
+ end
160
+ end
161
+
162
+ # cuando se unen dos regiones de diferentes frames, nos dice de que tipo es la union, Complete, Internal, N-terminus...
163
+ def type_fusion(prev_type,one_type)
164
+ res_type = :Internal
165
+ if (prev_type == :C_terminus) #-------------- C-terminus
166
+ if (one_type == :N_terminus)
167
+ res_type = :Internal
168
+ elsif (one_type == :Internal)
169
+ res_type = :Internal
170
+ elsif (one_type == :Complete)
171
+ res_type = :C_terminus
172
+ end
173
+ elsif (prev_type == :N_terminus) #-------------- N-terminus
174
+ if (one_type == :C_terminus)
175
+ res_type = :Complete
176
+ elsif (one_type == :Internal)
177
+ res_type = :N_terminus
178
+ elsif (one_type == :Complete)
179
+ res_type = :Complete
180
+ end
181
+ elsif (prev_type == :Internal) #-------------- Internal
182
+ if (one_type == :C_terminus)
183
+ res_type = :C_terminus
184
+ elsif (one_type == :N_terminus)
185
+ res_type = :Internal
186
+ elsif (one_type == :Complete)
187
+ res_type = :C_terminus
188
+ end
189
+ elsif (prev_type == :Complete) #-------------- Complete
190
+ if (one_type == :C_terminus)
191
+ res_type = :Complete
192
+ elsif (one_type == :N_terminus)
193
+ res_type = :N_terminus
194
+ elsif (one_type == :Internal)
195
+ res_type = :N_terminus
196
+ end
197
+ end
198
+ return res_type
199
+ end
200
+
201
+ # para escoger la region codificante mas grande, incluso solapando verios frames
202
+ def compare_regions(this_strand,tc_fasta)
203
+
204
+ # select the largest ORF without frame repair-------------------------------- hay que comentar todo el bloque de arriba
205
+ best_orf = nil
206
+ best_orf = this_strand.sort{|x,y| y.seq.length <=> x.seq.length }[0]
207
+
208
+ if !best_orf.nil?
209
+ best_orf.type = best_orf.status
210
+ end
211
+
212
+ return best_orf
213
+
214
+ end
215
+
216
+ # to add regions over a determinated size
217
+ def add_region(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type,seq)
218
+
219
+ if (orf_seq.length >= 200)
220
+ # puts "#{seq.seq_name}, #{orf_t_start} - #{orf_t_end}, #{orf_frame}, #{orf_type}"
221
+ seq.add_orf(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type)
222
+ end
223
+
224
+ end
225
+
226
+ # se buscan las regiones Complete, N-terminus, C-terminus e Internal en la secuencia de cada frame
227
+ def generate_uncomplete_orf(a,frame,seq)
228
+
229
+ my_atg = true
230
+ atg_codon = false
231
+ stop_codon = false
232
+ any_stop = false
233
+
234
+ orf =''
235
+ t_start = 0
236
+ t_end = 0
237
+
238
+ a.each do |e|
239
+ t_end += 3
240
+ orf += e
241
+
242
+ if (e == 'ATG') && (!atg_codon)
243
+ atg_codon = true
244
+ stop_codon = false
245
+ t_start = t_end - 3
246
+ elsif (e == 'TAG') || (e == 'TGA') || (e == 'TAA')
247
+ orf_tmp = orf[t_start..t_end]
248
+ if (any_stop)
249
+ # case 1, complete orf
250
+ if (atg_codon)
251
+ orf_tmp = orf[t_start..t_end]
252
+ add_region(orf_tmp, t_start, t_end, frame, any_stop, :Complete,seq)
253
+ end
254
+ else
255
+ # case 2, C_terminus
256
+ if (my_atg)
257
+ orf_tmp = orf[0..t_end]
258
+ add_region(orf_tmp, 0, t_end, frame, any_stop, :C_terminus,seq)
259
+ # case 3, putative complete, complete without stop codon before atg
260
+ if (atg_codon)
261
+ orf_tmp = orf[t_start..t_end]
262
+ add_region(orf_tmp, t_start, t_end, frame, any_stop, :Complete, seq)
263
+ end
264
+ end
265
+ end
266
+
267
+ stop_codon = true
268
+ any_stop = true
269
+ my_atg = false
270
+ atg_codon = false
271
+ t_start += 3
272
+ end
273
+ end
274
+
275
+ # case 4, N_terminus and case 6, putative N_terminus
276
+ if (atg_codon) && (!stop_codon)
277
+ orf_tmp = orf[t_start..t_end]
278
+ add_region(orf_tmp, t_start, t_end, frame, any_stop, :N_terminus,seq)
279
+ end
280
+
281
+ # case 5, internal
282
+ if (my_atg) && (!any_stop)
283
+ orf_tmp = orf[0..t_end]
284
+ add_region(orf_tmp, 0, t_end, frame, any_stop, :Internal,seq)
285
+ end
286
+
287
+ end
288
+
289
+ # recorre cada uno de los frames y los pasa a generate_uncomplete_orf
290
+ def uncomplete_orf_finder(seq)
291
+
292
+ s = seq.seq_fasta.upcase
293
+ f1 = s.split('').each_slice(3).map{|e| e.join}
294
+ generate_uncomplete_orf(f1,1,seq)
295
+
296
+ s.sub!(/^./,'')
297
+ f2 = s.split('').each_slice(3).map{|e| e.join}
298
+ generate_uncomplete_orf(f2,2,seq)
299
+
300
+ s.sub!(/^./,'')
301
+ f3 = s.split('').each_slice(3).map{|e| e.join}
302
+ generate_uncomplete_orf(f3,3,seq)
303
+
304
+ # vamos a por los ORFs de la cadena complementaria
305
+ s = seq.seq_fasta.upcase
306
+ s = s.complementary_dna
307
+
308
+ f4 = s.split('').each_slice(3).map{|e| e.join}
309
+ generate_uncomplete_orf(f4,-1,seq)
310
+
311
+ s.sub!(/^./,'')
312
+ f5 = s.split('').each_slice(3).map{|e| e.join}
313
+ generate_uncomplete_orf(f5,-2,seq)
314
+
315
+ s.sub!(/^./,'')
316
+ f6 = s.split('').each_slice(3).map{|e| e.join}
317
+ generate_uncomplete_orf(f6,-3,seq)
318
+
319
+ end
320
+
321
+
322
+
323
+
324
+
325
+
326
+
327
+
328
+ # | = ATG
329
+ # * = STOP
330
+ #
331
+ # -------*---|>>>>>>>>>>>>>> 1 >>>>>>>>>>>>*-------------
332
+ # >>>>>>>>>> 2 >>>>>>>>>>>>>*----------------------------
333
+ # ---|>>>>>>>> 3 >>>>>>>>>>>>*---------------------------
334
+ # -------------------*-----------------|>>>>>>>>>> 4 >>>>
335
+ # >>>>>>>>>>>>>>>>>>>>>>>>>> 5 >>>>>>>>>>>>>>>>>>>>>>>>>>
336
+ #--------------------------- 6 ----------|>>>>>>>>>>>>>>>
337
+ #
338
+ # 1 complete orf, with stop codon before atg
339
+ # 2 C-terminus
340
+ # 3 putative complete, complete without stop codon before atg
341
+ # 4 N_terminus, with stop codon before atg
342
+ # 5 internal
343
+ # 6 putative N_terminus, N_terminus without stop codon before atg
344
+
345
+ # no se usa
346
+ def orf_fusion(orfs_array,tc_fasta,name,ref_status,ref_code,ref_start,ref_end,ref_frame,ref_orf,ref_msgs)
347
+
348
+ lower_start = 9999
349
+ higher_end = 0
350
+ new_orf = false
351
+ ref_name = name
352
+
353
+ # vamos a coger el mejor de referencia (el coding mas largo)
354
+ # y vamos a poner en el warning los coding que estan en el mismo sentido y no estan contenidos en el mejor
355
+ orfs_array.sort! {|orf1,orf2| orf1[1] <=> orf2[1]}
356
+
357
+ tmp_orf = ref_orf
358
+ tmp_start = ref_start
359
+ tmp_end = ref_end
360
+ tmp_frame = ref_frame
361
+ tmp_msg = ''
362
+
363
+ # puts "\n\n#{name} ---- tmp_frame: #{tmp_frame} ------------------------ \n\n"
364
+
365
+ orfs_array.each do |orf|
366
+
367
+ (orf[1],orf[2]) = corrige_frame(orf[3],orf[1],orf[2])
368
+ # (orf[1],orf[2]) = $an.corrige_frame(orf[3],orf[1],orf[2])
369
+
370
+ # puts "*** name: #{name}, orf[1]: #{orf[1]}, orf[2]: #{orf[2]}, orf[3]: #{orf[3]} ref_start: #{ref_start}, ref_end: #{ref_end}, ref_frame: #{ref_frame} ***\n\n"
371
+
372
+ if (orf[0] != tmp_orf)
373
+
374
+ if ((tmp_end >= orf[1]) && (tmp_end <= orf[2])) || ((tmp_start >= orf[1]) && (tmp_start <= orf[2])) # los ORFs solapan
375
+
376
+ # puts "SOLAPAN frame: #{orf[3]} tmp_start: #{tmp_start} tmp_end: #{tmp_end} orf_start: #{orf[1]} orf_end: #{orf[2]}"
377
+ if (tmp_frame > 0)
378
+ tmp_msg = ", overlapping coding region (#{orf[1]},#{orf[2]})"
379
+ elsif (tmp_frame < 0)
380
+ tmp_msg = ", overlapping coding region (-#{orf[1]},-#{orf[2]})"
381
+ end
382
+ new_orf = true
383
+
384
+ elsif (tmp_end < orf[1]) || (tmp_start > orf[2]) # los ORFs estan separados
385
+
386
+ # puts "#{name} frame: #{orf[3]} SEPARADOS --> tmp_start: #{tmp_start} tmp_end: #{tmp_end} orf_start: #{orf[1]} orf_end: #{orf[2]}"
387
+ if (tmp_frame > 0)
388
+ tmp_msg = ", other coding region (#{orf[1]},#{orf[2]})"
389
+ elsif (tmp_frame < 0)
390
+ tmp_msg = ", other coding region (-#{orf[1]},-#{orf[2]})"
391
+ end
392
+ new_orf = true
393
+
394
+ end
395
+
396
+ if (new_orf == true)
397
+
398
+ if (orf[1] < lower_start)
399
+ lower_start = orf[1]
400
+ end
401
+
402
+ # if (orf[2] > higher_end)
403
+ # end
404
+
405
+ (tmp_code,tmp_status)=testCode_exec(tc_fasta[lower_start-1..higher_end-1])
406
+
407
+ if (tmp_status != 'unknown')
408
+ # puts "#{name} ----------------------------------- FUSION!!!!!!!!!!!!!!!!\n\n"
409
+ # tenemos varios ORFs q son codificantes al unirlos
410
+ ref_msgs += tmp_msg
411
+ new_orf = false
412
+ else
413
+ # puts "#{name} ------------------------------------NO se unen\n\n"
414
+ end
415
+ end
416
+ end
417
+ end
418
+ return[ref_status,ref_code,ref_name,ref_start,ref_end,ref_frame,ref_orf,ref_msgs]
419
+ end
420
+ # no se usa
421
+ def t_code_old(seq)
422
+
423
+ ref_code = 0.0
424
+ ref_name = ''
425
+ ref_start = 0
426
+ ref_end = 0
427
+ ref_frame = 0
428
+ ref_status = ''
429
+ ref_orf = ''
430
+ ref_msgs = ''
431
+
432
+ name = seq.seq_name
433
+ tc_fasta = seq.seq_fasta
434
+
435
+ # generamos todos los ORFs mayores de 200pb de cada secuencia, en fl_string_utils
436
+ orfs_array = tc_fasta.orf_finder
437
+
438
+ if (orfs_array[0].nil?)
439
+ ref_name = name
440
+ ref_code = 0.0
441
+ ref_start = 0
442
+ ref_end = 0
443
+ ref_frame = 0
444
+ ref_status = 'unknown'
445
+ ref_orf = ''
446
+ ref_msgs = 'ORF length < 200 nt'
447
+ # ref_msgs = 'Your sequence has not an ORF longer than 200 nt'
448
+
449
+ else
450
+ one_good_orf_minus = false
451
+ more_than_one_minus = false
452
+ one_good_orf_plus = false
453
+ more_than_one_plus = false
454
+
455
+ good_orfs_minus = []
456
+ good_orfs_plus = []
457
+
458
+ one_coding = false
459
+ one_putative = false
460
+
461
+ # orfs_array.sort! {|orf1,orf2| (orf1[2] - orf1[1]) <=> (orf2[2] - orf2[1])}
462
+ orfs_array.each do |orf|
463
+
464
+ # long = orf.length - 1
465
+ if (orf[0])
466
+ # if (long >= 200)
467
+ (t_code,t_status)=testCode_exec(orf[0])
468
+ # puts "name: #{name},t_status: #{t_status}, t_code: #{t_code}, stop_codon: #{orf[4]}, length: #{tc_fasta.length}, start: #{orf[1]}, end: #{orf[2]}, frame: #{orf[3]}\n\n"
469
+
470
+ if (t_status != 'unknown')
471
+ if (orf[3].to_i < 0)
472
+ if (one_good_orf_minus == true)
473
+ more_than_one_minus = true
474
+ orf.push t_code
475
+ orf.push t_status
476
+ good_orfs_minus.push orf
477
+ else
478
+ one_good_orf_minus = true
479
+ orf.push t_code #orf[5]
480
+ orf.push t_status #orf[6]
481
+ good_orfs_minus.push orf
482
+ end
483
+ elsif (orf[3].to_i > 0)
484
+ if (one_good_orf_plus == true)
485
+ more_than_one_plus = true
486
+ orf.push t_code
487
+ orf.push t_status
488
+ good_orfs_plus.push orf
489
+ else
490
+ one_good_orf_plus = true
491
+ orf.push t_code #orf[5]
492
+ orf.push t_status #orf[6]
493
+ good_orfs_plus.push orf
494
+ end
495
+ end
496
+ end
497
+
498
+ # if (t_code.to_f > ref_code) # cogemos el de mejor testcode
499
+ # puts "name: #{name}, orf[0].length: #{orf[0].length}, ref_orf.length: #{ref_orf.length}, t_status: #{t_status}, one_coding: #{one_coding}\n\n"
500
+ if (orf[0].length > ref_orf.length) and (t_status == 'coding') # cogemos el mayor de los coding
501
+ # puts "compleeeeeeeeeeeeeeetttttttttttaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
502
+ ref_code = t_code.to_f
503
+ ref_name = name
504
+ ref_orf = orf[0]
505
+ ref_start = orf[1]
506
+ ref_end = orf[2]
507
+ ref_frame = orf[3]
508
+
509
+ one_coding = true
510
+ (ref_status,ref_msgs) = checking_beginning(orf[4],t_status,orf[1],tc_fasta,orf[2])
511
+
512
+ elsif (one_coding == false) and (orf[0].length > ref_orf.length) and (t_status == 'putative_coding')
513
+ # puts "putaaaaaaaaaaaaaaaaaaativeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"
514
+ ref_code = t_code.to_f
515
+ ref_name = name
516
+ ref_orf = orf[0]
517
+ ref_start = orf[1]
518
+ ref_end = orf[2]
519
+ ref_frame = orf[3]
520
+
521
+ one_putative = true
522
+ (ref_status,ref_msgs) = checking_beginning(orf[4],t_status,orf[1],tc_fasta,orf[2])
523
+
524
+ elsif (one_coding == false) and (one_putative == false)
525
+ # puts "unknoooooooooooooooooooooooooooooooooooooooooooooooooooooooooooown"
526
+ ref_status = t_status
527
+ ref_code = t_code.to_f
528
+ ref_name = name
529
+ ref_orf = orf[0]
530
+ ref_start = orf[1]
531
+ ref_end = orf[2]
532
+ ref_frame = orf[3]
533
+
534
+ ref_msgs = 'Bad testcode score'
535
+ # ref_msgs = 'test code did not find a coding region in your sequence'
536
+ end
537
+ # puts "**name: #{name},ref_status: #{ref_status}, ref_code: #{ref_code}, length: #{tc_fasta.length}, ref_start: #{ref_start}, ref_end: #{ref_end}, ref_frame: #{ref_frame}\n\n"
538
+ # puts "t_code: #{t_code}, t_status: #{t_status}"
539
+ end
540
+ end
541
+
542
+ # vamos a preparar el mejor ORF y devolverlo
543
+ (ref_start,ref_end) = corrige_frame(ref_frame,ref_start,ref_end)
544
+ # puts "**name: #{name}, ref_start: #{ref_start}, ref_end: #{ref_end}, ref_frame: #{ref_frame}\n\n"
545
+
546
+ # si encontramos más de un orf valido
547
+ if (more_than_one_plus) or (more_than_one_minus)
548
+ if (ref_frame > 0)
549
+ if (more_than_one_plus)
550
+ (ref_status,ref_code,ref_name,ref_start,ref_end,ref_frame,ref_orf,ref_msgs) = orf_fusion(good_orfs_plus,tc_fasta,name,ref_status,ref_code,ref_start,ref_end,ref_frame,ref_orf,ref_msgs)
551
+ # if (ref_frame == 0)
552
+ # ref_frame = 7
553
+ # end
554
+ # puts "merging ORFs with orf_fusion!!!!"
555
+ end
556
+ elsif (ref_frame < 0)
557
+ if (more_than_one_minus)
558
+ (ref_status,ref_code,ref_name,ref_start,ref_end,ref_frame,ref_orf,ref_msgs) = orf_fusion(good_orfs_minus,tc_fasta,name,ref_status,ref_code,ref_start,ref_end,ref_frame,ref_orf,ref_msgs)
559
+ # if (ref_frame == 0)
560
+ # ref_frame = -7
561
+ # end
562
+ # puts "merging ORFs with orf_fusion!!!!"
563
+ end
564
+ end
565
+ end
566
+
567
+ # puts "name: #{name},t_status: #{ref_status}, t_code: #{ref_code}, frame: #{ref_frame},start: #{ref_start}, end: #{ref_end}\n\n"
568
+
569
+ if (ref_frame < 0)
570
+ (kk1,kk2,ref_start2,ref_end2) = reverse_seq(tc_fasta,ref_frame,ref_start,ref_end)
571
+
572
+ ref_start_ok = "#{ref_start2} (-#{ref_start})"
573
+ ref_end_ok = "#{ref_end2} (-#{ref_end})"
574
+ else
575
+ ref_start_ok = ref_start
576
+ ref_end_ok = ref_end
577
+ end
578
+ end
579
+
580
+ return [ref_name,ref_code,ref_status,ref_start_ok,ref_end_ok,ref_frame,ref_orf,ref_msgs]
581
+
582
+ end
583
+ # no se usa
584
+ # gnuplot must be installed
585
+ def window_walking(seq)
586
+
587
+ tcode_array = []
588
+ y1=[]
589
+ y2=[]
590
+
591
+ s = seq.seq_fasta.upcase
592
+ (s.length-200).times do |i|
593
+ (t_code,t_status)=testCode_exec(s[i..i+199])
594
+ y1.push t_code
595
+ end
596
+
597
+ tcode_array_rc = []
598
+
599
+ src = s.complementary_dna
600
+ (src.length-200).times do |i|
601
+ (t_code,t_status)=testCode_exec(src[i..i+199])
602
+ # puts "#{i}-#{i+199}"
603
+ # puts t_status
604
+ # puts t_code
605
+ puts src[i..i+199]
606
+ y2.push t_code
607
+ end
608
+
609
+ # Create lines plot
610
+ p=ScbiPlot::Lines.new('lines.png','title')
611
+ x=(1..src.length-200).entries
612
+ p.add_x(x)
613
+
614
+ # puts "x_length: #{src.length-200} #{x}"
615
+ # puts "x: #{x.length}, y: #{y1.length}"
616
+ p.add_series('serie0', y1)
617
+
618
+ p.do_graph
619
+
620
+ end
621
+ # no se usa
622
+ # para escoger la region codificante mas grande, incluso solapando verios frames
623
+ def compare_regions_old(this_strand,tc_fasta)
624
+
625
+ # array_regions = []
626
+ # array_regions.push this_strand.first.dup
627
+ #
628
+ # this_strand.each do |one_orf|
629
+ #
630
+ # prev_orf = array_regions.last.dup # taking last orf before add actual orf
631
+ # array_regions.push one_orf.dup # add actual orf, el primer orf me va a salir duplicado
632
+ #
633
+ # if prev_orf.overlaps?(one_orf)
634
+ # prev_orf.seq = tc_fasta[prev_orf.t_start..one_orf.t_end]
635
+ #
636
+ # (t_code,t_status)=testCode_exec(prev_orf.seq)
637
+ #
638
+ # prev_orf.score = t_code
639
+ # prev_orf.status = t_status
640
+ #
641
+ # #t_start, frame and stop_codon are the same
642
+ # prev_orf.t_end = one_orf.t_end
643
+ # if (prev_orf.type != one_orf.type)
644
+ # prev_orf.type = type_fusion(prev_orf.type,one_orf.type)
645
+ # end
646
+ # prev_orf.more_than_one_frame = true
647
+ #
648
+ # # puts "overlaps:"
649
+ # # puts "#{one_orf.t_start} - #{one_orf.t_end}"
650
+ # # puts "#{array_regions.last.t_start} - #{array_regions.last.t_end}"
651
+ #
652
+ # array_regions.push prev_orf # add overlapped orf,
653
+ # end
654
+ #
655
+ # end
656
+ #
657
+ # # select the largest ORF without frame repair-------------------------------- hay que comentar todo el bloque de arriba
658
+ # # best_orf = nil
659
+ # # best_orf = this_strand.sort{|x,y| y.seq.length <=> x.seq.length }[0]
660
+ # # ---------------------------------------------------------------------------
661
+ #
662
+ # # select the largest ORF ----------------------------------------------------
663
+ # # best_orf = nil
664
+ # # best_orf = array_regions.sort{|x,y| y.seq.length <=> x.seq.length }[0]
665
+ # # if !best_orf.nil?
666
+ # # puts "best_orf.status: #{best_orf.status}, best_orf.type: #{best_orf.type}"
667
+ # # end
668
+ #
669
+ #
670
+ # # ---------------------------------------------------------------------------
671
+ #
672
+ # # select the largest coding ORF ---------------------------------------------
673
+ # best_orf = nil
674
+ # array_regions.sort{|x,y| y.seq.length <=> x.seq.length }.each do |one_orf|
675
+ # # puts "#{one_orf.status}, #{one_orf.type}"
676
+ # if (one_orf.status == :coding)
677
+ # best_orf = one_orf
678
+ # break
679
+ # end
680
+ # end
681
+ # # ---------------------------------------------------------------------------
682
+ #
683
+ # # puts "\n\n\nstart"
684
+ # # array_regions.sort{|x,y| y.seq.length <=> x.seq.length }.each do |one_orf|
685
+ # # puts one_orf.seq.length
686
+ # # end
687
+ # # puts "end\n\n\n"
688
+ # # puts "best_orf: #{best_orf.seq.length}"
689
+ #
690
+ # # if !best_orf.nil?
691
+ # # best_orf.type = best_orf.status
692
+ # # end
693
+ #
694
+ # return best_orf
695
+
696
+ end
697
+
698
+ ###### test code program functions #########
699
+
700
+ def testCode_exec (sequence)
701
+
702
+ sequence.downcase!
703
+ basesOne = [0,0,0,0];
704
+ basesTwo = [0,0,0,0];
705
+ basesThree = [0,0,0,0];
706
+
707
+ #for (j = 0; j < sequence.length; j = j + 3)
708
+
709
+ 0.step(sequence.length-1,3) do |j|
710
+ if (sequence[j].chr == "g")
711
+ basesOne[0] = basesOne[0] + 1;
712
+ elsif (sequence[j].chr == "a")
713
+ basesOne[1] = basesOne[1] + 1;
714
+ elsif (sequence[j].chr == "t")
715
+ basesOne[2] = basesOne[2] + 1;
716
+ elsif (sequence[j].chr == "c")
717
+ basesOne[3] = basesOne[3] + 1;
718
+ else
719
+ end
720
+ end
721
+
722
+ #for (j = 1; j < sequence.length; j = j + 3)
723
+ 1.step(sequence.length-1,3) do |j|
724
+ if (sequence[j].chr == "g")
725
+ basesTwo[0] = basesTwo[0] + 1;
726
+ elsif (sequence[j].chr == "a")
727
+ basesTwo[1] = basesTwo[1] + 1;
728
+ elsif (sequence[j].chr == "t")
729
+ basesTwo[2] = basesTwo[2] + 1;
730
+ elsif (sequence[j].chr == "c")
731
+ basesTwo[3] = basesTwo[3] + 1;
732
+ else
733
+ end
734
+ end
735
+
736
+
737
+ #for (j = 2; j < sequence.length; j = j + 3)
738
+ 2.step(sequence.length-1,3) do |j|
739
+ if (sequence[j].chr == "g")
740
+ basesThree[0] = basesThree[0] + 1;
741
+ elsif (sequence[j].chr == "a")
742
+ basesThree[1] = basesThree[1] + 1;
743
+ elsif (sequence[j].chr == "t")
744
+ basesThree[2] = basesThree[2] + 1;
745
+ elsif (sequence[j].chr == "c")
746
+ basesThree[3] = basesThree[3] + 1;
747
+ else
748
+ end
749
+ end
750
+
751
+ paramG = calcParam(basesOne[0],basesTwo[0],basesThree[0]);
752
+ contentG = countBases(basesOne[0],basesTwo[0],basesThree[0]) / sequence.length.to_f;
753
+ posProbG = usePosParam(paramG,"g");
754
+ contProbG = useContParam(contentG,"g");
755
+ paramA = calcParam(basesOne[1],basesTwo[1],basesThree[1]);
756
+ contentA = countBases(basesOne[1],basesTwo[1],basesThree[1]) / sequence.length.to_f;
757
+ posProbA = usePosParam(paramA,"a");
758
+ contProbA = useContParam(contentA,"a");
759
+ paramT = calcParam(basesOne[2],basesTwo[2],basesThree[2]);
760
+ contentT = countBases(basesOne[2],basesTwo[2],basesThree[2]) / sequence.length.to_f;
761
+ posProbT = usePosParam(paramT,"t");
762
+ contProbT = useContParam(contentT,"t");
763
+ paramC = calcParam(basesOne[3],basesTwo[3],basesThree[3]);
764
+ contentC = countBases(basesOne[3],basesTwo[3],basesThree[3]) / sequence.length.to_f;
765
+ posProbC = usePosParam(paramC,"c");
766
+ contProbC = useContParam(contentC,"c");
767
+ valueY = posProbG * 0.31 + contProbG * 0.15 + posProbA * 0.26 + contProbA * 0.11 + posProbT * 0.33 + contProbT * 0.14 + posProbC * 0.18 + contProbC * 0.12;
768
+ valueY = ((valueY*1000.0).round/1000.0);
769
+
770
+ # return 'The TestCode value is <b>' + valueY.to_s + '</b>, which indicates that the sequence ' + getConclusion(valueY) + '.';
771
+ return [valueY.to_s, getConclusion(valueY)]
772
+ end
773
+
774
+ def calcParam (valueOne,valueTwo,valueThree)
775
+ paramArray = [valueOne,valueTwo,valueThree];
776
+ paramArray = paramArray.sort#{|a,b| return a-b}#(compareNumbers);
777
+ paramValue = paramArray[2] / (paramArray[0] + 1.0);
778
+ # puts paramArray.to_json
779
+ return paramValue;
780
+ end
781
+
782
+ def countBases (valueOne,valueTwo,valueThree)
783
+ return valueOne + valueTwo + valueThree;
784
+ end
785
+
786
+ def usePosParam (paramValue,base)
787
+ arrayOfCodingProb = [];
788
+ codeProb = 0;
789
+ if (base == "g")
790
+ arrayOfCodingProb = [0.08,0.08,0.16,0.27,0.48,0.53,0.64,0.74,0.88,0.90]
791
+ elsif (base == "a")
792
+ arrayOfCodingProb = [0.22,0.20,0.34,0.45,0.68,0.58,0.93,0.84,0.68,0.94]
793
+ elsif (base == "t")
794
+ arrayOfCodingProb = [0.09,0.09,0.20,0.54,0.44,0.69,0.68,0.91,0.97,0.97]
795
+ elsif (base == "c")
796
+ arrayOfCodingProb = [0.23,0.30,0.33,0.51,0.48,0.66,0.81,0.70,0.70,0.80]
797
+ end
798
+
799
+
800
+ if (paramValue >= 0 and paramValue < 1.1)
801
+ codeProb = arrayOfCodingProb[0];
802
+ elsif (paramValue >=1.1 and paramValue < 1.2)
803
+ codeProb = arrayOfCodingProb[1];
804
+ elsif (paramValue >=1.2 and paramValue < 1.3)
805
+ codeProb = arrayOfCodingProb[2];
806
+ elsif (paramValue >=1.3 and paramValue < 1.4)
807
+ codeProb = arrayOfCodingProb[3];
808
+ elsif (paramValue >=1.4 and paramValue < 1.5)
809
+ codeProb = arrayOfCodingProb[4];
810
+ elsif (paramValue >=1.5 and paramValue < 1.6)
811
+ codeProb = arrayOfCodingProb[5];
812
+ elsif (paramValue >=1.6 and paramValue < 1.7)
813
+ codeProb = arrayOfCodingProb[6];
814
+ elsif (paramValue >=1.7 and paramValue < 1.8)
815
+ codeProb = arrayOfCodingProb[7];
816
+ elsif (paramValue >=1.8 and paramValue < 1.9)
817
+ codeProb = arrayOfCodingProb[8];
818
+ elsif (paramValue >=1.9)
819
+ codeProb = arrayOfCodingProb[9];
820
+ end
821
+
822
+ return codeProb;
823
+ end
824
+
825
+ def useContParam (paramValue,base)
826
+ arrayOfCodingProb = [];
827
+ codeProb = 0;
828
+ if (base == "g")
829
+ arrayOfCodingProb = [0.29,0.33,0.41,0.41,0.73,0.64,0.64,0.47,0.54,0.40]
830
+ elsif (base == "a")
831
+ arrayOfCodingProb = [0.21,0.81,0.65,0.67,0.49,0.62,0.55,0.44,0.49,0.28]
832
+ elsif (base == "t")
833
+ arrayOfCodingProb = [0.58,0.51,0.69,0.56,0.75,0.55,0.40,0.39,0.24,0.28]
834
+ elsif (base == "c")
835
+ arrayOfCodingProb = [0.31,0.39,0.44,0.43,0.59,0.59,0.64,0.51,0.64,0.82]
836
+ end
837
+
838
+ if (paramValue >= 0 and paramValue < 0.17)
839
+ codeProb = arrayOfCodingProb[0];
840
+ elsif (paramValue >=0.17 and paramValue < 0.19)
841
+ codeProb = arrayOfCodingProb[1];
842
+ elsif (paramValue >=0.19 and paramValue < 0.21)
843
+ codeProb = arrayOfCodingProb[2];
844
+ elsif (paramValue >=0.21 and paramValue < 0.23)
845
+ codeProb = arrayOfCodingProb[3];
846
+ elsif (paramValue >=0.23 and paramValue < 0.25)
847
+ codeProb = arrayOfCodingProb[4];
848
+ elsif (paramValue >=0.25 and paramValue < 0.27)
849
+ codeProb = arrayOfCodingProb[5];
850
+ elsif (paramValue >=0.27 and paramValue < 0.29)
851
+ codeProb = arrayOfCodingProb[6];
852
+ elsif (paramValue >=0.29 and paramValue < 0.31)
853
+ codeProb = arrayOfCodingProb[7];
854
+ elsif (paramValue >=0.31 and paramValue < 0.33)
855
+ codeProb = arrayOfCodingProb[8];
856
+ elsif (paramValue >=0.33)
857
+ codeProb = arrayOfCodingProb[9];
858
+ end
859
+
860
+ return codeProb;
861
+ end
862
+
863
+ def getConclusion (testCode_value)
864
+ codeProb = "";
865
+ if (testCode_value < 0.74)
866
+ codeProb = :unknown;
867
+ elsif (testCode_value >=0.74 and testCode_value < 0.95)
868
+ codeProb = :putative_coding;
869
+ elsif (testCode_value >=0.95)
870
+ codeProb = :coding;
871
+ end
872
+
873
+ return codeProb;
874
+ end
875
+
876
+
877
+ end