full_lengther_next 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+
2
# A candidate open reading frame (ORF) located on a sequence, together with
# the classification data attached to it later by the TestCode analysis.
class Orf
  attr_accessor :seq, :t_start, :t_end, :frame, :stop_codon, :type, :status, :score, :more_than_one_frame

  # orf_seq    - nucleotide string of the region
  # t_start    - start coordinate of the region
  # t_end      - end coordinate of the region
  # frame      - reading frame; TestCode treats negative values as the minus strand
  # stop_codon - flag stored as given by the caller
  # type       - :N_terminus, :C_terminus, :Complete, :Internal,
  #              :Putative_Complete or :Putative_N_terminus
  def initialize(orf_seq, t_start, t_end, frame, stop_codon, type)
    @seq, @t_start, @t_end = orf_seq, t_start, t_end
    @frame, @stop_codon = frame, stop_codon
    @type = type
    @status = :unknown # one of :unknown, :putative_coding, :coding
    @score = 0
    @more_than_one_frame = false
  end

  # True only when this ORF's end lies strictly inside +other_orf+
  # (other.t_start < self.t_end < other.t_end). The check is deliberately
  # one-directional: an ORF that fully contains +other_orf+ reports false.
  def overlaps?(other_orf)
    own_end = t_end
    (own_end > other_orf.t_start) && (own_end < other_orf.t_end)
  end
end
@@ -0,0 +1,111 @@
1
+
2
+ require 'orf'
3
+
4
# A named nucleotide sequence plus the ORFs and annotations collected for it.
class Sequence
  attr_accessor :seq_name, :seq_fasta, :seq_qual, :orfs, :sec_desc

  # seq_name  - identifier of the sequence
  # seq_fasta - nucleotide string; upper-case IUPAC ambiguity codes are
  #             replaced immediately (see change_degenerated_nt!)
  # seq_qual  - optional quality data, stored as given (defaults to '')
  def initialize(seq_name, seq_fasta, seq_qual = '')
    @seq_name = seq_name
    @seq_fasta = seq_fasta
    change_degenerated_nt!
    # The argument used to be discarded in favour of a hard-coded ''.
    @seq_qual = seq_qual
    @sec_desc = ''
    @annotations = []
    @orfs = []

    @rejected = false
    @rejected_message = ''
  end

  # Builds an Orf from the given fields and appends it to #orfs.
  def add_orf(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type)
    @orfs.push Orf.new(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type)
  end

  # Whether reject! has been called on this sequence.
  def rejected?
    @rejected
  end

  # Marks the sequence as rejected and records why.
  def reject!(message = '')
    @rejected = true
    @rejected_message = message
  end

  # Returns every stored annotation of the given type.
  # Known types: :complete, :tmp_annotation, :error, :protein, :nucleotide,
  # :alignment, :tcode
  def get_annotations(annotation_type)
    @annotations.select { |a| a[:annotation_type] == annotation_type }
  end

  # Stores an annotation hash ({:annotation_type, :message}). When
  # +replace_existing+ is true, all earlier annotations of the same type are
  # removed first.
  def annotate(annotation_type, message = '', replace_existing = false)
    if replace_existing
      # delete_if avoids removing elements from the array while another
      # iterator is walking it.
      @annotations.delete_if { |a| a[:annotation_type] == annotation_type }
    end

    @annotations.push({ :annotation_type => annotation_type, :message => message })
  end

  # Replaces upper-case IUPAC ambiguity codes in @seq_fasta with concrete
  # lower-case nucleotides. Each table entry is [candidates, use counter].
  def change_degenerated_nt!
    tranlaste_hash = {
      'R' => [%w[a g], 0],
      'W' => [%w[a t], 0],
      'M' => [%w[a c], 0],
      'K' => [%w[g t], 0],
      'S' => [%w[g c], 0],
      'Y' => [%w[c t], 0],
      'H' => [%w[a t c], 0],
      'B' => [%w[g t c], 0],
      'D' => [%w[g a t], 0],
      'V' => [%w[g a c], 0],
      'N' => [%w[g a c t], 0]
    }

    fix_degenerated_fasta!(tranlaste_hash)
  end

  # Rewrites @seq_fasta using +tranlaste_hash+ (code => [candidates, counter]).
  # Every occurrence of a code bumps its counter and takes
  # candidates[counter % candidates.length], so repeated codes cycle through
  # their candidates. The counters inside the hash are mutated. Only
  # upper-case codes are recognised.
  def fix_degenerated_fasta!(tranlaste_hash)
    fixed = @seq_fasta.split('').map do |nt|
      next nt unless nt =~ /[RWMKSYHBDVN]/

      entry = tranlaste_hash[nt]
      entry[1] += 1
      entry[0][entry[1] % entry[0].length]
    end

    @seq_fasta = fixed.compact.join
  end
end
@@ -0,0 +1,877 @@
1
+
2
+ require 'common_functions'
3
+ require 'scbi_plot'
4
+
5
+ include CommonFunctions
6
+
7
+ class TestCode
8
+
9
# Runs the TestCode analysis on +seq+ and stores the outcome as its single
# :tcode annotation (any previous :tcode annotation is replaced).
#
# Sequences shorter than 200 nt are not analysed and are reported as
# 'unknown'; longer ones are delegated to t_code. The annotation message is a
# tab-separated row whose column layout is fixed by the format string below.
def initialize(seq)
  tcode_row = lambda do |name, status, score, msgs, frame, from, to|
    "#{name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{status}\t#{score}\t\t\t\t\t#{msgs}\t#{frame}\t#{from}\t#{to}\t\t\t\t"
  end

  if seq.seq_fasta.length < 200
    row = tcode_row.call(seq.seq_name, 'unknown', 0.0, 'Sequence length < 200 nt', 0, 0, seq.seq_fasta.length)
  else
    # t_code also returns the ORF string and two flags; they are not reported.
    name, score, status, from, to, frame, _orf, msgs = t_code(seq)
    row = tcode_row.call(name, status, score, msgs, frame, from, to)
  end

  seq.annotate(:tcode, row, true)
end
78
+
79
# Scores every ORF of +seq+ with TestCode and returns the longest candidate.
#
# Returns a 10-element array:
#   [name, score, type, start, end, frame, orf_seq, message,
#    stop_codon flag, more_than_one_frame flag]
# When no candidate of at least 200 nt exists, a placeholder row with type
# 'unknown' is returned instead.
def t_code(seq)
  name = seq.seq_name
  tc_fasta = seq.seq_fasta

  # Populate seq.orfs with the ORFs of all six frames.
  uncomplete_orf_finder(seq)

  plus_strand = []
  minus_strand = []

  # Walk the ORFs from 5' to 3', score each one, and bucket it by strand.
  seq.orfs.sort { |x, y| x.t_start <=> y.t_start }.each do |candidate|
    score, verdict = testCode_exec(candidate.seq)
    candidate.status = verdict
    candidate.score = score

    if candidate.frame < 0
      minus_strand.push candidate
    elsif candidate.frame > 0
      plus_strand.push candidate
    end
  end

  best_plus = plus_strand.empty? ? nil : compare_regions(plus_strand, tc_fasta)
  best_minus = minus_strand.empty? ? nil : compare_regions(minus_strand, tc_fasta)

  # Keep the longer of the two strand winners; the plus strand wins ties.
  best_region =
    if best_plus.nil?
      best_minus
    elsif best_minus.nil?
      best_plus
    elsif best_minus.seq.length > best_plus.seq.length
      best_minus
    else
      best_plus
    end

  if best_region.nil? || best_region.seq.length < 200
    return [name, 0.0, 'unknown', 0, 0, 0, '', 'Non coding ORF found >= 200 nt ', false, false]
  end

  # Flag ORFs whose start is not preceded by a stop codon.
  # NOTE(review): compare_regions overwrites type with the TestCode status, so
  # these two type checks may never match -- confirm the intended behaviour.
  ref_msgs = nil
  unless best_region.stop_codon
    if best_region.type == :Complete
      best_region.type = 'Putative Complete'
      ref_msgs = 'NO STOP codon before ATG. '
    elsif best_region.type == :N_terminus
      ref_msgs = 'NO STOP codon before ATG. '
    end
  end

  [name, best_region.score, best_region.type, best_region.t_start, best_region.t_end,
   best_region.frame, best_region.seq, ref_msgs, best_region.stop_codon, best_region.more_than_one_frame]
end
161
+
162
# When two regions from different frames are joined, returns the type of the
# merged region (:Complete, :Internal, :N_terminus or :C_terminus) given the
# type of the upstream region (+prev_type+) and the downstream one
# (+one_type+). Any combination not listed below yields :Internal.
def type_fusion(prev_type, one_type)
  merged_type = {
    [:C_terminus, :Complete]   => :C_terminus,
    [:N_terminus, :C_terminus] => :Complete,
    [:N_terminus, :Internal]   => :N_terminus,
    [:N_terminus, :Complete]   => :Complete,
    [:Internal, :C_terminus]   => :C_terminus,
    [:Internal, :Complete]     => :C_terminus,
    [:Complete, :C_terminus]   => :Complete,
    [:Complete, :N_terminus]   => :N_terminus,
    [:Complete, :Internal]     => :N_terminus
  }

  merged_type.fetch([prev_type, one_type], :Internal)
end
200
+
201
# Picks the longest ORF of one strand (no frame repair is attempted) and
# overwrites its +type+ with its TestCode +status+. Returns that ORF, or nil
# when +this_strand+ is empty. +tc_fasta+ is accepted but not used.
def compare_regions(this_strand, tc_fasta)
  longest = this_strand.sort { |x, y| y.seq.length <=> x.seq.length }.first
  longest.type = longest.status unless longest.nil?
  longest
end
215
+
216
# Registers a region on +seq+ (via seq.add_orf) only when its sequence is at
# least 200 nt long; shorter regions are silently ignored.
def add_region(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type, seq)
  seq.add_orf(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type) if orf_seq.length >= 200
end
225
+
226
# Scans the codons of one reading frame and registers (through add_region)
# the Complete, C-terminus, N-terminus and Internal regions found in it.
#
# a     - array of codon strings for this frame
# frame - frame identifier passed through to add_region
# seq   - object that receives the regions
#
# Coordinates count nucleotides within the frame string: every codon advances
# the cursor by 3, and a region start is the offset of its ATG (or 0).
def generate_uncomplete_orf(a, frame, seq)
  stop_codons = %w[TAG TGA TAA]

  before_first_stop = true # no stop codon consumed yet
  atg_open = false         # an ATG is waiting for its stop codon
  closed_by_stop = false   # a stop has been seen since the last ATG
  stop_seen = false        # at least one stop codon was seen so far

  read = ''
  region_start = 0
  cursor = 0

  a.each do |codon|
    cursor += 3
    read += codon

    if codon == 'ATG' && !atg_open
      atg_open = true
      closed_by_stop = false
      region_start = cursor - 3
      next
    end

    next unless stop_codons.include?(codon)

    if stop_seen
      # case 1: complete ORF, with a stop codon upstream of its ATG
      add_region(read[region_start..cursor], region_start, cursor, frame, stop_seen, :Complete, seq) if atg_open
    elsif before_first_stop
      # case 2: C-terminus, everything from the frame start to the first stop
      add_region(read[0..cursor], 0, cursor, frame, stop_seen, :C_terminus, seq)
      # case 3: putative complete, no stop codon upstream of the ATG
      add_region(read[region_start..cursor], region_start, cursor, frame, stop_seen, :Complete, seq) if atg_open
    end

    closed_by_stop = true
    stop_seen = true
    before_first_stop = false
    atg_open = false
    region_start += 3
  end

  # cases 4 and 6: (putative) N-terminus, an ATG that is never closed
  if atg_open && !closed_by_stop
    add_region(read[region_start..cursor], region_start, cursor, frame, stop_seen, :N_terminus, seq)
  end

  # case 5: internal, the frame contains no stop codon at all
  if before_first_stop && !stop_seen
    add_region(read[0..cursor], 0, cursor, frame, stop_seen, :Internal, seq)
  end
end
288
+
289
# Splits the upper-cased sequence into codons for each of the six reading
# frames and hands every frame to generate_uncomplete_orf. Frames 1..3 come
# from the sequence itself, frames -1..-3 from String#complementary_dna
# (defined outside this file).
def uncomplete_orf_finder(seq)
  to_codons = lambda { |nts| nts.split('').each_slice(3).map { |triplet| triplet.join } }
  last_result = nil

  [1, -1].each do |sign|
    strand = seq.seq_fasta.upcase
    strand = strand.complementary_dna if sign < 0

    (1..3).each do |offset|
      # Each additional frame drops one more leading nucleotide.
      strand.sub!(/^./, '') if offset > 1
      last_result = generate_uncomplete_orf(to_codons.call(strand), sign * offset, seq)
    end
  end

  last_result
end
320
+
321
+
322
+
323
+
324
+
325
+
326
+
327
+
328
+ # | = ATG
329
+ # * = STOP
330
+ #
331
+ # -------*---|>>>>>>>>>>>>>> 1 >>>>>>>>>>>>*-------------
332
+ # >>>>>>>>>> 2 >>>>>>>>>>>>>*----------------------------
333
+ # ---|>>>>>>>> 3 >>>>>>>>>>>>*---------------------------
334
+ # -------------------*-----------------|>>>>>>>>>> 4 >>>>
335
+ # >>>>>>>>>>>>>>>>>>>>>>>>>> 5 >>>>>>>>>>>>>>>>>>>>>>>>>>
336
+ #--------------------------- 6 ----------|>>>>>>>>>>>>>>>
337
+ #
338
+ # 1 complete orf, with stop codon before atg
339
+ # 2 C-terminus
340
+ # 3 putative complete, complete without stop codon before atg
341
+ # 4 N_terminus, with stop codon before atg
342
+ # 5 internal
343
+ # 6 putative N_terminus, N_terminus without stop codon before atg
344
+
345
# Not used by the current pipeline.
#
# Takes the reference ORF (the longest coding one) and appends to +ref_msgs+ a
# note for every other ORF of the same strand that overlaps it or lies apart
# from it, provided the fused span is not scored 'unknown' by testCode_exec.
#
# orfs_array - array of ORF arrays ([seq, start, end, frame, ...]); it is
#              sorted in place by start and its coordinates are rewritten
#              through corrige_frame (defined outside this file)
#
# Returns [ref_status, ref_code, name, ref_start, ref_end, ref_frame, ref_orf,
# ref_msgs]; only ref_msgs can differ from the arguments.
#
# NOTE(review): the upper bound of the fused span is never raised from 0, so
# the slice below always runs to the end of tc_fasta; and getConclusion
# returns Symbols, so the comparison with the String 'unknown' is always
# true. Both are preserved as found -- confirm before reviving this method.
def orf_fusion(orfs_array,tc_fasta,name,ref_status,ref_code,ref_start,ref_end,ref_frame,ref_orf,ref_msgs)
  lowest_start = 9999
  highest_end = 0
  pending = false
  note = ''

  orfs_array.sort! { |left, right| left[1] <=> right[1] }

  orfs_array.each do |orf|
    orf[1], orf[2] = corrige_frame(orf[3], orf[1], orf[2])
    next if orf[0] == ref_orf

    from = orf[1]
    to = orf[2]

    relation =
      if (ref_end >= from && ref_end <= to) || (ref_start >= from && ref_start <= to)
        'overlapping' # the ORFs overlap
      elsif ref_end < from || ref_start > to
        'other'       # the ORFs are apart
      end

    if relation
      if ref_frame > 0
        note = ", #{relation} coding region (#{from},#{to})"
      elsif ref_frame < 0
        note = ", #{relation} coding region (-#{from},-#{to})"
      end
      pending = true
    end

    next unless pending

    lowest_start = from if from < lowest_start
    _score, verdict = testCode_exec(tc_fasta[(lowest_start - 1)..(highest_end - 1)])

    if verdict != 'unknown'
      # several ORFs that are still coding once joined
      ref_msgs += note
      pending = false
    end
  end

  [ref_status, ref_code, name, ref_start, ref_end, ref_frame, ref_orf, ref_msgs]
end
420
# Not used by the current pipeline (superseded by t_code).
#
# Scores the ORFs returned by String#orf_finder (defined outside this file)
# and returns the preferred one as
#   [name, score, status, start, end, frame, orf_seq, message]
# For minus-strand results start/end are strings of the form
# "<converted> (-<original>)" built from reverse_seq.
#
# NOTE(review): testCode_exec reports its status as a Symbol (see
# getConclusion) while this method compares against Strings, so the 'coding'
# and 'putative_coding' branches look unreachable as written. Preserved as
# found -- confirm before reviving this method.
def t_code_old(seq)
  name = seq.seq_name
  tc_fasta = seq.seq_fasta

  # all ORFs longer than 200 bp of the sequence (see fl_string_utils)
  orfs_array = tc_fasta.orf_finder

  return [name, 0.0, 'unknown', nil, nil, 0, '', 'ORF length < 200 nt'] if orfs_array[0].nil?

  ref_code = 0.0
  ref_name = ''
  ref_start = 0
  ref_end = 0
  ref_frame = 0
  ref_status = ''
  ref_orf = ''
  ref_msgs = ''

  good_orfs = { :plus => [], :minus => [] }
  several = { :plus => false, :minus => false }
  one_coding = false
  one_putative = false

  orfs_array.each do |orf|
    next unless orf[0]

    score, verdict = testCode_exec(orf[0])

    # Remember every non-'unknown' ORF per strand; score and status are
    # appended to the ORF array itself (orf[5], orf[6]).
    if verdict != 'unknown'
      strand =
        if orf[3].to_i < 0
          :minus
        elsif orf[3].to_i > 0
          :plus
        end

      if strand
        several[strand] = true unless good_orfs[strand].empty?
        orf.push score
        orf.push verdict
        good_orfs[strand].push orf
      end
    end

    # Prefer the longest coding ORF, then the longest putative one, and
    # otherwise keep the last ORF seen.
    longer = orf[0].length > ref_orf.length
    rank =
      if longer && verdict == 'coding'
        :coding
      elsif !one_coding && longer && verdict == 'putative_coding'
        :putative
      elsif !one_coding && !one_putative
        :fallback
      end
    next if rank.nil?

    ref_code = score.to_f
    ref_name = name
    ref_orf = orf[0]
    ref_start = orf[1]
    ref_end = orf[2]
    ref_frame = orf[3]

    if rank == :fallback
      ref_status = verdict
      ref_msgs = 'Bad testcode score'
    else
      one_coding = true if rank == :coding
      one_putative = true if rank == :putative
      ref_status, ref_msgs = checking_beginning(orf[4], verdict, orf[1], tc_fasta, orf[2])
    end
  end

  # prepare the best ORF for the caller
  ref_start, ref_end = corrige_frame(ref_frame, ref_start, ref_end)

  # more than one valid ORF on the strand of the reference: try to fuse them
  if several[:plus] || several[:minus]
    strand =
      if ref_frame > 0
        :plus
      elsif ref_frame < 0
        :minus
      end

    if strand && several[strand]
      ref_status, ref_code, ref_name, ref_start, ref_end, ref_frame, ref_orf, ref_msgs =
        orf_fusion(good_orfs[strand], tc_fasta, name, ref_status, ref_code, ref_start, ref_end, ref_frame, ref_orf, ref_msgs)
    end
  end

  if ref_frame < 0
    _kk1, _kk2, ref_start2, ref_end2 = reverse_seq(tc_fasta, ref_frame, ref_start, ref_end)
    ref_start_ok = "#{ref_start2} (-#{ref_start})"
    ref_end_ok = "#{ref_end2} (-#{ref_end})"
  else
    ref_start_ok = ref_start
    ref_end_ok = ref_end
  end

  [ref_name, ref_code, ref_status, ref_start_ok, ref_end_ok, ref_frame, ref_orf, ref_msgs]
end
583
# Not used by the current pipeline. Requires gnuplot to be installed.
#
# Slides a 200 nt window along the upper-cased sequence, scores every window
# with testCode_exec and plots the forward-strand scores to 'lines.png'
# through ScbiPlot. The complementary strand is scored too and each of its
# windows is printed to stdout, but those scores are not plotted.
def window_walking(seq)
  forward_scores = []
  reverse_scores = []

  forward = seq.seq_fasta.upcase
  (forward.length - 200).times do |i|
    score, _verdict = testCode_exec(forward[i..i + 199])
    forward_scores.push score
  end

  src = forward.complementary_dna
  (src.length - 200).times do |i|
    score, _verdict = testCode_exec(src[i..i + 199])
    puts src[i..i + 199]
    reverse_scores.push score
  end

  # Create lines plot
  plot = ScbiPlot::Lines.new('lines.png', 'title')
  plot.add_x((1..src.length - 200).entries)
  plot.add_series('serie0', forward_scores)
  plot.do_graph
end
621
# Not used by the current pipeline; kept only as a placeholder that returns
# nil for any input.
#
# Its retired implementation chose the largest coding region even across
# frames: it walked the ORFs of a strand, and whenever the previous region
# overlapped the next one (Orf#overlaps?) it built a merged region from
# tc_fasta[prev.t_start..next.t_end], re-scored it with testCode_exec, merged
# the types with type_fusion and flagged it as more_than_one_frame. The
# longest region whose status was :coding was then returned.
def compare_regions_old(this_strand, tc_fasta)
  nil
end
697
+
698
+ ###### test code program functions #########
699
+
700
# Computes the TestCode statistic for a nucleotide string.
#
# For each base (g, a, t, c) two figures are derived: a position parameter
# (calcParam over the base's counts at codon positions 1, 2 and 3) and the
# base's share of the whole string. Both are mapped to probabilities
# (usePosParam / useContParam) and combined with fixed weights.
#
# sequence - nucleotide string, any case. It is NOT modified: the analysis
#            works on a lower-cased copy, so callers keep their original
#            string and frozen strings are accepted.
#
# Returns [score_as_string, conclusion], where score is rounded to three
# decimals and conclusion comes from getConclusion.
def testCode_exec(sequence)
  bases = sequence.downcase

  # counts[p][nt] = occurrences of nt at codon position p (0, 1 or 2)
  counts = Array.new(3) { Hash.new(0) }
  bases.each_char.with_index { |nt, i| counts[i % 3][nt] += 1 }

  # [base, weight of the position probability, weight of the content probability]
  weights = [['g', 0.31, 0.15], ['a', 0.26, 0.11], ['t', 0.33, 0.14], ['c', 0.18, 0.12]]

  terms = []
  weights.each do |base, pos_weight, cont_weight|
    per_position = counts.map { |at_position| at_position[base] }
    param = calcParam(*per_position)
    content = countBases(*per_position) / bases.length.to_f
    terms << usePosParam(param, base) * pos_weight
    terms << useContParam(content, base) * cont_weight
  end

  # Plain left-to-right addition keeps the floating point result stable.
  value_y = terms.inject { |acc, term| acc + term }
  value_y = (value_y * 1000.0).round / 1000.0

  [value_y.to_s, getConclusion(value_y)]
end
773
+
774
# Position asymmetry of one base: the largest of its three per-codon-position
# counts divided by (the smallest count + 1.0). Always returns a Float.
def calcParam(valueOne, valueTwo, valueThree)
  ordered = [valueOne, valueTwo, valueThree].sort
  ordered.last / (ordered.first + 1.0)
end
781
+
782
# Total occurrences of a base: the sum of its three per-codon-position counts.
def countBases(valueOne, valueTwo, valueThree)
  [valueOne, valueTwo, valueThree].inject { |total, count| total + count }
end
785
+
786
# Maps a position parameter (see calcParam) to the coding probability of the
# given base ("g", "a", "t" or "c").
#
# The parameter is bucketed into ten intervals: [0, 1.1), [1.1, 1.2), ...,
# [1.8, 1.9) and [1.9, inf). Negative or NaN parameters yield 0. For an
# unrecognised base the lookup table is empty, so nil is returned.
def usePosParam(paramValue, base)
  coding_prob = {
    'g' => [0.08, 0.08, 0.16, 0.27, 0.48, 0.53, 0.64, 0.74, 0.88, 0.90],
    'a' => [0.22, 0.20, 0.34, 0.45, 0.68, 0.58, 0.93, 0.84, 0.68, 0.94],
    't' => [0.09, 0.09, 0.20, 0.54, 0.44, 0.69, 0.68, 0.91, 0.97, 0.97],
    'c' => [0.23, 0.30, 0.33, 0.51, 0.48, 0.66, 0.81, 0.70, 0.70, 0.80]
  }.fetch(base, [])

  return 0 unless paramValue >= 0

  upper_bounds = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]
  bucket = upper_bounds.index { |upper| paramValue < upper } || upper_bounds.length
  coding_prob[bucket]
end
824
+
825
# Maps a base's content fraction (its share of the sequence) to the coding
# probability of the given base ("g", "a", "t" or "c").
#
# The fraction is bucketed into ten intervals: [0, 0.17), [0.17, 0.19), ...,
# [0.31, 0.33) and [0.33, inf). Negative or NaN fractions yield 0. For an
# unrecognised base the lookup table is empty, so nil is returned.
def useContParam(paramValue, base)
  coding_prob = {
    'g' => [0.29, 0.33, 0.41, 0.41, 0.73, 0.64, 0.64, 0.47, 0.54, 0.40],
    'a' => [0.21, 0.81, 0.65, 0.67, 0.49, 0.62, 0.55, 0.44, 0.49, 0.28],
    't' => [0.58, 0.51, 0.69, 0.56, 0.75, 0.55, 0.40, 0.39, 0.24, 0.28],
    'c' => [0.31, 0.39, 0.44, 0.43, 0.59, 0.59, 0.64, 0.51, 0.64, 0.82]
  }.fetch(base, [])

  return 0 unless paramValue >= 0

  upper_bounds = [0.17, 0.19, 0.21, 0.23, 0.25, 0.27, 0.29, 0.31, 0.33]
  bucket = upper_bounds.index { |upper| paramValue < upper } || upper_bounds.length
  coding_prob[bucket]
end
862
+
863
# Classifies a TestCode score: below 0.74 is :unknown, from 0.74 up to (but
# excluding) 0.95 is :putative_coding, and 0.95 or more is :coding. A score
# that satisfies none of the comparisons (NaN) yields an empty string.
def getConclusion(testCode_value)
  if testCode_value >= 0.95
    :coding
  elsif testCode_value >= 0.74
    :putative_coding
  elsif testCode_value < 0.74
    :unknown
  else
    ""
  end
end
875
+
876
+
877
+ end