full_lengther_next 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Manifest.txt +27 -0
- data/PostInstall.txt +6 -0
- data/README.rdoc +147 -0
- data/Rakefile +37 -0
- data/bin/download_fln_dbs.rb +197 -0
- data/bin/full_lengther_next +173 -0
- data/bin/make_user_db.rb +144 -0
- data/lib/full_lengther_next.rb +13 -0
- data/lib/full_lengther_next/classes/common_functions.rb +94 -0
- data/lib/full_lengther_next/classes/fl2_stats.rb +222 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +688 -0
- data/lib/full_lengther_next/classes/fl_string_utils.rb +139 -0
- data/lib/full_lengther_next/classes/lcs.rb +33 -0
- data/lib/full_lengther_next/classes/my_worker.rb +122 -0
- data/lib/full_lengther_next/classes/my_worker_manager.rb +167 -0
- data/lib/full_lengther_next/classes/orf.rb +32 -0
- data/lib/full_lengther_next/classes/sequence.rb +111 -0
- data/lib/full_lengther_next/classes/test_code.rb +877 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +287 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_full_lengther_next.rb +11 -0
- data/test/test_helper.rb +3 -0
- metadata +150 -0
@@ -0,0 +1,32 @@
|
|
1
|
+
|
2
|
+
# A candidate open reading frame (ORF) located on a sequence.
#
# All fields are plain read/write attributes; the class holds no other logic
# than the end-inside-other check in #overlaps?.
class Orf
  attr_accessor :seq, :t_start, :t_end, :frame, :stop_codon,
                :type, :status, :score, :more_than_one_frame

  # orf_seq::    nucleotide string of the region
  # t_start::    start coordinate of the region
  # t_end::      end coordinate of the region
  # frame::      reading-frame identifier supplied by the caller
  # stop_codon:: flag supplied by the caller about a stop codon for this region
  # type::       :N_terminus, :C_terminus, :Complete, :Internal,
  #              :Putative_Complete or :Putative_N_terminus
  def initialize(orf_seq, t_start, t_end, frame, stop_codon, type)
    @seq                 = orf_seq
    @t_start             = t_start
    @t_end               = t_end
    @frame               = frame
    @stop_codon          = stop_codon
    @type                = type
    @status              = :unknown # :unknown, :putative_coding, :coding
    @score               = 0
    @more_than_one_frame = false
  end

  # True only when this ORF ends strictly inside +other_orf+, i.e. its end is
  # greater than the other's start and smaller than the other's end.
  # The check is intentionally one-sided: containment and equal ends yield false.
  def overlaps?(other_orf)
    other_start = other_orf.t_start
    other_end   = other_orf.t_end
    own_end     = t_end

    (own_end > other_start) && (own_end < other_end)
  end
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
|
2
|
+
require 'orf'
|
3
|
+
|
4
|
+
# One input sequence together with its ORFs, annotations and rejection state.
#
# On construction every upper-case degenerate nucleotide code in the FASTA
# string is replaced by a concrete lower-case nucleotide (see
# #change_degenerated_nt!).
class Sequence
  # Upper-case degenerate codes and the concrete nucleotides each may stand for.
  DEGENERATED_NT = {
    'R' => %w[a g],
    'W' => %w[a t],
    'M' => %w[a c],
    'K' => %w[g t],
    'S' => %w[g c],
    'Y' => %w[c t],
    'H' => %w[a t c],
    'B' => %w[g t c],
    'D' => %w[g a t],
    'V' => %w[g a c],
    'N' => %w[g a c t]
  }.freeze

  attr_accessor :seq_name, :seq_fasta, :seq_qual, :orfs, :sec_desc

  # seq_name::  identifier of the sequence
  # seq_fasta:: nucleotide string (degenerate codes are resolved immediately)
  # seq_qual::  optional quality data; stored as given (defaults to '')
  def initialize(seq_name, seq_fasta, seq_qual = '')
    @seq_name = seq_name
    @seq_fasta = seq_fasta
    change_degenerated_nt!
    # Previously this was hard-coded to '', silently discarding the argument.
    @seq_qual = seq_qual
    @sec_desc = ''
    @annotations = []
    @orfs = []

    @rejected = false
    @rejected_message = ''
  end

  # Builds an Orf from the given fields and appends it to #orfs.
  # Returns the updated ORF list.
  def add_orf(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type)
    @orfs.push(Orf.new(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type))
  end

  # Whether #reject! has been called on this sequence.
  def rejected?
    @rejected
  end

  # Marks the sequence as rejected and stores the reason.
  def reject!(message = '')
    @rejected = true
    @rejected_message = message
  end

  # Returns every stored annotation whose :annotation_type equals
  # +annotation_type+. Types listed by the original author:
  # :complete, :tmp_annotation, :error, :protein, :nucleotide, :alignment, :tcode
  def get_annotations(annotation_type)
    @annotations.select { |a| a[:annotation_type] == annotation_type }
  end

  # Appends an annotation hash ({:annotation_type, :message}).
  # When +replace_existing+ is true, all previous annotations of the same type
  # are removed first. Returns the annotation list.
  def annotate(annotation_type, message = '', replace_existing = false)
    if replace_existing
      # reject! removes in one pass instead of deleting while iterating.
      @annotations.reject! { |a| a[:annotation_type] == annotation_type }
    end

    @annotations.push({ :annotation_type => annotation_type, :message => message })
  end

  # Replaces the degenerate codes of @seq_fasta using DEGENERATED_NT, with a
  # fresh usage counter (starting at 0) for every code.
  def change_degenerated_nt!
    tranlaste_hash = {}
    DEGENERATED_NT.each { |code, nts| tranlaste_hash[code] = [nts, 0] }

    fix_degenerated_fasta!(tranlaste_hash)
  end

  # Rewrites @seq_fasta, substituting each character matching
  # /[RWMKSYHBDVN]/ (upper case only) with one of its candidate nucleotides.
  #
  # tranlaste_hash:: code => [candidates, counter]. The counter is incremented
  #                  on every occurrence (the hash is mutated) and the candidate
  #                  at index counter % candidates.length is chosen, so
  #                  successive occurrences rotate through the candidates.
  # Returns the new FASTA string.
  def fix_degenerated_fasta!(tranlaste_hash)
    fixed = @seq_fasta.split('').map do |nt|
      next nt unless nt =~ /[RWMKSYHBDVN]/

      entry = tranlaste_hash[nt]
      entry[1] += 1
      entry[0][entry[1] % entry[0].length]
    end

    @seq_fasta = fixed.compact.join
  end
end
|
@@ -0,0 +1,877 @@
|
|
1
|
+
|
2
|
+
require 'common_functions'
|
3
|
+
require 'scbi_plot'
|
4
|
+
|
5
|
+
include CommonFunctions
|
6
|
+
|
7
|
+
class TestCode
|
8
|
+
|
9
|
+
def initialize(seq)
  # Runs the test-code analysis for +seq+ and stores the outcome as a single
  # :tcode annotation on it (replacing any previous :tcode annotation).
  #
  # Sequences shorter than 200 nt are not analysed; they are annotated as
  # 'unknown' with score 0.0 and the whole sequence as the region.
  # Otherwise the fields come from #t_code. Earlier variants (scoring the whole
  # sequence, adding the translated protein to the annotation) were disabled
  # in the original source and are not reproduced here.
  if seq.seq_fasta.length < 200
    short_name   = seq.seq_name
    short_score  = 0.0
    short_frame  = 0
    short_status = 'unknown'
    short_msgs   = 'Sequence length < 200 nt'
    region_start = 0
    region_end   = seq.seq_fasta.length

    seq.annotate(:tcode, "#{short_name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{short_status}\t#{short_score}\t\t\t\t\t#{short_msgs}\t#{short_frame}\t#{region_start}\t#{region_end}\t\t\t\t", true)
  else
    # t_code also returns the ORF sequence and two flags after these fields;
    # they are not used for the annotation line.
    name, score, status, region_start, region_end, frame, _orf, msgs = t_code(seq)

    seq.annotate(:tcode, "#{name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{status}\t#{score}\t\t\t\t\t#{msgs}\t#{frame}\t#{region_start}\t#{region_end}\t\t\t\t", true)
  end
end
|
78
|
+
|
79
|
+
def t_code(seq)
  # Finds the best candidate region of +seq+ and describes it.
  #
  # Steps: generate the ORFs of all frames (uncomplete_orf_finder), score each
  # one with testCode_exec, split them by strand (frame sign), let
  # compare_regions pick one per strand and keep the longer of the two
  # (the plus strand wins ties).
  #
  # Returns a 10-element array:
  #   [name, score, type, start, end, frame, orf_seq, message,
  #    stop_codon, more_than_one_frame]
  # When no region of at least 200 nt is available, a fixed 'unknown' record
  # with zeroed coordinates is returned instead.
  name = seq.seq_name
  tc_fasta = seq.seq_fasta

  uncomplete_orf_finder(seq)

  plus_strand = []
  minus_strand = []

  # Walk the ORFs from 5' to 3', score them and file them by strand.
  seq.orfs.sort { |a, b| a.t_start <=> b.t_start }.each do |candidate|
    score, status = testCode_exec(candidate.seq)
    candidate.status = status
    candidate.score = score

    if candidate.frame < 0
      minus_strand.push(candidate)
    elsif candidate.frame > 0
      plus_strand.push(candidate)
    end
  end

  best_plus = plus_strand.empty? ? nil : compare_regions(plus_strand, tc_fasta)
  best_minus = minus_strand.empty? ? nil : compare_regions(minus_strand, tc_fasta)

  # Longest region of both strands; minus only wins when strictly longer.
  best_region =
    if best_plus.nil?
      best_minus
    elsif best_minus.nil?
      best_plus
    elsif best_minus.seq.length > best_plus.seq.length
      best_minus
    else
      best_plus
    end

  if best_region.nil? || best_region.seq.length < 200
    return [name, 0.0, 'unknown', 0, 0, 0, '', 'Non coding ORF found >= 200 nt ', false, false]
  end

  # Flag regions that start at an ATG with no stop codon recorded for them.
  ref_msgs = nil
  if (best_region.type == :Complete) && !best_region.stop_codon
    best_region.type = 'Putative Complete'
    ref_msgs = 'NO STOP codon before ATG. '
  elsif (best_region.type == :N_terminus) && !best_region.stop_codon
    ref_msgs = 'NO STOP codon before ATG. '
  end

  [name, best_region.score, best_region.type, best_region.t_start, best_region.t_end,
   best_region.frame, best_region.seq, ref_msgs, best_region.stop_codon,
   best_region.more_than_one_frame]
end
|
161
|
+
|
162
|
+
# When two regions from different frames are joined, returns the type of the union: Complete, Internal, N-terminus...
|
163
|
+
def type_fusion(prev_type, one_type)
  # Returns the type symbol of the region obtained by joining a region of
  # +prev_type+ with a following region of +one_type+.
  # Any combination not listed below (including joining two regions of the
  # same type, or unrecognised values) yields :Internal.
  fusion_table = {
    :C_terminus => { :N_terminus => :Internal,   :Internal   => :Internal,   :Complete => :C_terminus },
    :N_terminus => { :C_terminus => :Complete,   :Internal   => :N_terminus, :Complete => :Complete },
    :Internal   => { :C_terminus => :C_terminus, :N_terminus => :Internal,   :Complete => :C_terminus },
    :Complete   => { :C_terminus => :Complete,   :N_terminus => :N_terminus, :Internal => :N_terminus }
  }

  fusion_table.fetch(prev_type, {}).fetch(one_type, :Internal)
end
|
200
|
+
|
201
|
+
# Chooses the largest coding region, even when it overlaps several frames.
|
202
|
+
def compare_regions(this_strand, tc_fasta)
  # Picks the ORF with the longest sequence from +this_strand+ (no frame
  # repair is attempted) and overwrites its #type with its #status.
  # +tc_fasta+ is accepted but not used by this implementation.
  # Returns the chosen ORF, or nil when +this_strand+ is empty.
  longest = this_strand.sort { |a, b| b.seq.length <=> a.seq.length }.first
  longest.type = longest.status unless longest.nil?
  longest
end
|
215
|
+
|
216
|
+
# Adds a region to the sequence only if it reaches the minimum size.
|
217
|
+
def add_region(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type, seq)
  # Registers the region on +seq+ (via seq.add_orf) only when its sequence is
  # at least 200 nt long. Returns nil for shorter regions, otherwise whatever
  # seq.add_orf returns.
  return nil if orf_seq.length < 200

  seq.add_orf(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type)
end
|
225
|
+
|
226
|
+
# Searches the codon sequence of one frame for Complete, N-terminus, C-terminus and Internal regions.
|
227
|
+
def generate_uncomplete_orf(a, frame, seq)
  # Scans the codons of one reading frame and reports candidate regions
  # through add_region.
  #
  # a::     array of codon strings for this frame
  # frame:: frame identifier, passed through to add_region unchanged
  # seq::   passed through to add_region unchanged
  #
  # Coordinates are offsets into the concatenated codons of this frame.
  # The stop-seen flag handed to add_region reflects whether a stop codon had
  # been met *before* the one that closes the region.
  before_first_stop = true  # no stop codon met yet in this frame
  atg_open = false          # an ATG has been met since the last stop
  last_was_stop = false     # a stop was met and no new ATG opened after it
  stop_seen = false         # at least one stop codon met so far

  orf = ''
  t_start = 0
  t_end = 0

  a.each do |codon|
    t_end += 3
    orf += codon

    if (codon == 'ATG') && !atg_open
      atg_open = true
      last_was_stop = false
      t_start = t_end - 3
    elsif %w[TAG TGA TAA].include?(codon)
      if stop_seen
        # Case 1: complete ORF, with a stop codon before its ATG.
        add_region(orf[t_start..t_end], t_start, t_end, frame, stop_seen, :Complete, seq) if atg_open
      elsif before_first_stop
        # Case 2: C-terminus, everything from the frame start to the first stop.
        add_region(orf[0..t_end], 0, t_end, frame, stop_seen, :C_terminus, seq)
        # Case 3: putative complete, an ATG..stop with no stop codon before the ATG.
        add_region(orf[t_start..t_end], t_start, t_end, frame, stop_seen, :Complete, seq) if atg_open
      end

      last_was_stop = true
      stop_seen = true
      before_first_stop = false
      atg_open = false
      t_start += 3
    end
  end

  # Cases 4 and 6: (putative) N-terminus, an ATG still open at the frame end.
  if atg_open && !last_was_stop
    add_region(orf[t_start..t_end], t_start, t_end, frame, stop_seen, :N_terminus, seq)
  end

  # Case 5: internal, the frame holds no stop codon at all.
  if before_first_stop && !stop_seen
    add_region(orf[0..t_end], 0, t_end, frame, stop_seen, :Internal, seq)
  end
end
|
288
|
+
|
289
|
+
# Walks through each of the frames and passes them to generate_uncomplete_orf.
|
290
|
+
def uncomplete_orf_finder(seq)
  # Splits the upper-cased sequence into codons for frames 1, 2 and 3, then
  # does the same for frames -1, -2 and -3 on the string returned by
  # String#complementary_dna (a project extension; presumably the reverse
  # complement, confirm in fl_string_utils), handing every codon list to
  # generate_uncomplete_orf. seq.seq_fasta itself is never modified.
  # Returns the result of the last generate_uncomplete_orf call (frame -3).
  scan_strand = lambda do |strand, sign|
    last_result = nil
    1.upto(3) do |frame_number|
      # Each further frame drops one more leading nucleotide.
      strand = strand.sub(/^./, '') if frame_number > 1
      codons = strand.split('').each_slice(3).map { |triplet| triplet.join }
      last_result = generate_uncomplete_orf(codons, sign * frame_number, seq)
    end
    last_result
  end

  scan_strand.call(seq.seq_fasta.upcase, 1)
  # Now the ORFs of the complementary strand.
  scan_strand.call(seq.seq_fasta.upcase.complementary_dna, -1)
end
|
320
|
+
|
321
|
+
|
322
|
+
|
323
|
+
|
324
|
+
|
325
|
+
|
326
|
+
|
327
|
+
|
328
|
+
# | = ATG
|
329
|
+
# * = STOP
|
330
|
+
#
|
331
|
+
# -------*---|>>>>>>>>>>>>>> 1 >>>>>>>>>>>>*-------------
|
332
|
+
# >>>>>>>>>> 2 >>>>>>>>>>>>>*----------------------------
|
333
|
+
# ---|>>>>>>>> 3 >>>>>>>>>>>>*---------------------------
|
334
|
+
# -------------------*-----------------|>>>>>>>>>> 4 >>>>
|
335
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>> 5 >>>>>>>>>>>>>>>>>>>>>>>>>>
|
336
|
+
#--------------------------- 6 ----------|>>>>>>>>>>>>>>>
|
337
|
+
#
|
338
|
+
# 1 complete orf, with stop codon before atg
|
339
|
+
# 2 C-terminus
|
340
|
+
# 3 putative complete, complete without stop codon before atg
|
341
|
+
# 4 N_terminus, with stop codon before atg
|
342
|
+
# 5 internal
|
343
|
+
# 6 putative N_terminus, N_terminus without stop codon before atg
|
344
|
+
|
345
|
+
# not used
|
346
|
+
def orf_fusion(orfs_array, tc_fasta, name, ref_status, ref_code, ref_start, ref_end, ref_frame, ref_orf, ref_msgs)
  # Compares the reference ORF (the ref_* arguments) with the other ORFs of the
  # same strand and appends a note to the message for every ORF that overlaps
  # it or lies apart from it, provided the joined stretch scores as something
  # other than 'unknown' with testCode_exec.
  #
  # orfs_array:: array of ORF records indexed as [seq, start, end, frame, ...].
  #              It is sorted in place by start, and each record's start/end
  #              are overwritten with the result of corrige_frame.
  # Returns [ref_status, ref_code, name, ref_start, ref_end, ref_frame,
  #          ref_orf, messages]; only the message can differ from the input.
  lower_start = 9999
  # NOTE(review): higher_end is never raised (the update was commented out in
  # the original), so the slice below always runs to the end of tc_fasta.
  higher_end = 0
  pending = false # an overlapping/separate ORF is waiting to be confirmed
  note = ''

  orfs_array.sort! { |a, b| a[1] <=> b[1] }

  orfs_array.each do |orf|
    orf[1], orf[2] = corrige_frame(orf[3], orf[1], orf[2])
    next if orf[0] == ref_orf

    if ((ref_end >= orf[1]) && (ref_end <= orf[2])) || ((ref_start >= orf[1]) && (ref_start <= orf[2]))
      # The ORFs overlap.
      if ref_frame > 0
        note = ", overlapping coding region (#{orf[1]},#{orf[2]})"
      elsif ref_frame < 0
        note = ", overlapping coding region (-#{orf[1]},-#{orf[2]})"
      end
      pending = true
    elsif (ref_end < orf[1]) || (ref_start > orf[2])
      # The ORFs are apart from each other.
      if ref_frame > 0
        note = ", other coding region (#{orf[1]},#{orf[2]})"
      elsif ref_frame < 0
        note = ", other coding region (-#{orf[1]},-#{orf[2]})"
      end
      pending = true
    end

    next unless pending

    lower_start = orf[1] if orf[1] < lower_start
    _fused_code, fused_status = testCode_exec(tc_fasta[(lower_start - 1)..(higher_end - 1)])

    # Several ORFs that are still coding once joined: record the note.
    if fused_status != 'unknown'
      ref_msgs += note
      pending = false
    end
  end

  [ref_status, ref_code, name, ref_start, ref_end, ref_frame, ref_orf, ref_msgs]
end
|
420
|
+
# not used
|
421
|
+
def t_code_old(seq)
  # Legacy ORF selection (marked as unused in the original source).
  #
  # Takes the ORF records returned by String#orf_finder on the sequence
  # (indexed as [seq, start, end, frame, stop_flag]), scores each with
  # testCode_exec and keeps as reference the longest 'coding' one, falling
  # back to the longest 'putative_coding' one and finally to the last record
  # seen. When a strand has more than one non-'unknown' ORF, orf_fusion is
  # asked to complete the message.
  #
  # Returns [name, score, status, start, end, frame, orf_seq, message].
  # For negative frames start/end are strings built from reverse_seq's
  # coordinates followed by the original ones in "(-N)" form.
  #
  # Fix: when no ORF is found the method used to return nil for start and end
  # (the *_ok variables were only assigned in the other branch); it now
  # returns the 0, 0 coordinates it had set for that case.
  name = seq.seq_name
  tc_fasta = seq.seq_fasta

  # All ORFs of each sequence above the size limit, produced in fl_string_utils.
  orfs_array = tc_fasta.orf_finder

  if orfs_array[0].nil?
    return [name, 0.0, 'unknown', 0, 0, 0, '', 'ORF length < 200 nt']
  end

  ref_code = 0.0
  ref_name = ''
  ref_start = 0
  ref_end = 0
  ref_frame = 0
  ref_status = ''
  ref_orf = ''
  ref_msgs = ''

  good_orfs_minus = []
  good_orfs_plus = []
  one_coding = false
  one_putative = false

  orfs_array.each do |orf|
    next unless orf[0]

    score, status = testCode_exec(orf[0])

    # Collect every ORF with a usable score, by strand.
    if status != 'unknown'
      frame = orf[3].to_i
      strand_orfs = if frame < 0
                      good_orfs_minus
                    elsif frame > 0
                      good_orfs_plus
                    end
      if strand_orfs
        orf.push score  # orf[5]
        orf.push status # orf[6]
        strand_orfs.push orf
      end
    end

    longer = orf[0].length > ref_orf.length
    if longer && ((status == 'coding') || (!one_coding && (status == 'putative_coding')))
      # Prefer the longest coding ORF; putative ones only while no coding
      # ORF has been accepted.
      ref_code = score.to_f
      ref_name = name
      ref_orf = orf[0]
      ref_start = orf[1]
      ref_end = orf[2]
      ref_frame = orf[3]

      if status == 'coding'
        one_coding = true
      else
        one_putative = true
      end
      ref_status, ref_msgs = checking_beginning(orf[4], status, orf[1], tc_fasta, orf[2])
    elsif !one_coding && !one_putative
      ref_status = status
      ref_code = score.to_f
      ref_name = name
      ref_orf = orf[0]
      ref_start = orf[1]
      ref_end = orf[2]
      ref_frame = orf[3]

      ref_msgs = 'Bad testcode score'
    end
  end

  more_than_one_plus = good_orfs_plus.length > 1
  more_than_one_minus = good_orfs_minus.length > 1

  # Prepare the best ORF to be returned.
  ref_start, ref_end = corrige_frame(ref_frame, ref_start, ref_end)

  # More than one valid ORF on some strand.
  if more_than_one_plus || more_than_one_minus
    fusion_candidates = if ref_frame > 0
                          more_than_one_plus ? good_orfs_plus : nil
                        elsif ref_frame < 0
                          more_than_one_minus ? good_orfs_minus : nil
                        end
    if fusion_candidates
      ref_status, ref_code, ref_name, ref_start, ref_end, ref_frame, ref_orf, ref_msgs =
        orf_fusion(fusion_candidates, tc_fasta, name, ref_status, ref_code,
                   ref_start, ref_end, ref_frame, ref_orf, ref_msgs)
    end
  end

  if ref_frame < 0
    _kk1, _kk2, ref_start2, ref_end2 = reverse_seq(tc_fasta, ref_frame, ref_start, ref_end)

    ref_start_ok = "#{ref_start2} (-#{ref_start})"
    ref_end_ok = "#{ref_end2} (-#{ref_end})"
  else
    ref_start_ok = ref_start
    ref_end_ok = ref_end
  end

  [ref_name, ref_code, ref_status, ref_start_ok, ref_end_ok, ref_frame, ref_orf, ref_msgs]
end
|
583
|
+
# not used
|
584
|
+
# gnuplot must be installed
|
585
|
+
def window_walking(seq)
  # Scores every 200-nt window of the upper-cased sequence with testCode_exec
  # and draws the scores as a line plot in 'lines.png' via ScbiPlot::Lines.
  #
  # NOTE(review): the windows of the complementary strand are scored too, and
  # each of those windows is printed to stdout, but their scores are not
  # plotted. This looks like leftover debugging; the behaviour is kept as is.
  sense = seq.seq_fasta.upcase
  sense_scores = (sense.length - 200).times.map do |i|
    score, _status = testCode_exec(sense[i..i + 199])
    score
  end

  antisense = sense.complementary_dna
  (antisense.length - 200).times do |i|
    testCode_exec(antisense[i..i + 199])
    puts antisense[i..i + 199]
  end

  # Create the line plot: one x position per window.
  plot = ScbiPlot::Lines.new('lines.png', 'title')
  plot.add_x((1..antisense.length - 200).to_a)
  plot.add_series('serie0', sense_scores)

  plot.do_graph
end
|
621
|
+
# not used
|
622
|
+
# Chooses the largest coding region, even when it overlaps several frames.
|
623
|
+
def compare_regions_old(this_strand, tc_fasta)
  # Disabled legacy implementation: the whole body was commented out in the
  # original source, so this method does nothing and returns nil.
  #
  # The disabled code walked the ORFs of a strand in order and, whenever the
  # previous ORF overlapped the current one (Orf#overlaps?), built a merged
  # region spanning both: sequence taken from tc_fasta, re-scored with
  # testCode_exec, type combined through type_fusion and
  # more_than_one_frame set to true. It then returned the longest region
  # whose status was :coding.
  nil
end
|
697
|
+
|
698
|
+
###### test code program functions #########
|
699
|
+
|
700
|
+
# Computes the TestCode value of a nucleotide sequence together with the
# verdict that getConclusion derives from it.
#
# The sequence is lower-cased IN PLACE, so the caller's string is modified.
# Occurrences of g, a, t and c are tallied separately for each of the three
# positions (index % 3). Any other character is left out of the tallies but
# still counts towards the sequence length used for the content fractions.
#
# sequence - String of nucleotides (must not be frozen, because of downcase!).
#
# Returns a two-element Array: the value rounded to three decimals, as a
# String, followed by the result of getConclusion for that rounded value.
def testCode_exec(sequence)
  sequence.downcase!

  column_of = { "g" => 0, "a" => 1, "t" => 2, "c" => 3 }

  # tallies[p][k]: occurrences of base k at indices where index % 3 == p
  tallies = Array.new(3) { [0, 0, 0, 0] }
  sequence.each_char.with_index do |nucleotide, position|
    column = column_of[nucleotide]
    tallies[position % 3][column] += 1 if column
  end

  total_length = sequence.length.to_f

  # [base, weight of the positional term, weight of the content term],
  # listed in the order in which the terms are added up.
  weighting = [
    ["g", 0.31, 0.15],
    ["a", 0.26, 0.11],
    ["t", 0.33, 0.14],
    ["c", 0.18, 0.12]
  ]

  terms = []
  weighting.each do |base, position_weight, content_weight|
    first, second, third = tallies.map { |row| row[column_of[base]] }
    asymmetry = calcParam(first, second, third)
    fraction = countBases(first, second, third) / total_length
    terms << usePosParam(asymmetry, base) * position_weight
    terms << useContParam(fraction, base) * content_weight
  end

  # Plain left-to-right addition of the eight weighted terms.
  score = terms.inject { |sum, term| sum + term }
  score = (score * 1000.0).round / 1000.0

  [score.to_s, getConclusion(score)]
end
|
773
|
+
|
774
|
+
# Divides the largest of the three given values by the smallest plus one.
#
# The 1.0 added to the smallest value makes the division a floating-point
# one and prevents a zero divisor when the smallest value is 0.
#
# Returns a Float.
def calcParam(valueOne, valueTwo, valueThree)
  smallest, largest = [valueOne, valueTwo, valueThree].minmax
  largest / (smallest + 1.0)
end
|
781
|
+
|
782
|
+
# Adds up the three given values, left to right, and returns the total.
def countBases(valueOne, valueTwo, valueThree)
  [valueOne, valueTwo, valueThree].inject { |total, value| total + value }
end
|
785
|
+
|
786
|
+
# Looks up the table entry for a positional parameter value and a base.
#
# Each base ("g", "a", "t", "c") has ten entries. The entry is selected by
# the half-open interval containing paramValue:
# [0, 1.1), [1.1, 1.2), ..., [1.8, 1.9), [1.9, +inf).
#
# paramValue - Numeric value to classify.
# base       - one of "g", "a", "t", "c".
#
# Returns the selected table entry; 0 when paramValue is not >= 0 (negative
# or NaN); nil when paramValue is >= 0 but base is not one of the four
# letters.
def usePosParam(paramValue, base)
  return 0 unless paramValue >= 0

  entries = {
    "g" => [0.08, 0.08, 0.16, 0.27, 0.48, 0.53, 0.64, 0.74, 0.88, 0.90],
    "a" => [0.22, 0.20, 0.34, 0.45, 0.68, 0.58, 0.93, 0.84, 0.68, 0.94],
    "t" => [0.09, 0.09, 0.20, 0.54, 0.44, 0.69, 0.68, 0.91, 0.97, 0.97],
    "c" => [0.23, 0.30, 0.33, 0.51, 0.48, 0.66, 0.81, 0.70, 0.70, 0.80]
  }.fetch(base, [])

  # The bounds are ascending, so the number of bounds reached is the index
  # of the interval that contains paramValue.
  lower_bounds = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]
  interval = lower_bounds.count { |bound| paramValue >= bound }

  entries[interval]
end
|
824
|
+
|
825
|
+
# Looks up the table entry for a content parameter value and a base.
#
# Each base ("g", "a", "t", "c") has ten entries. The entry is selected by
# the half-open interval containing paramValue:
# [0, 0.17), [0.17, 0.19), ..., [0.31, 0.33), [0.33, +inf).
#
# paramValue - Numeric value to classify.
# base       - one of "g", "a", "t", "c".
#
# Returns the selected table entry; 0 when paramValue is not >= 0 (negative
# or NaN); nil when paramValue is >= 0 but base is not one of the four
# letters.
def useContParam(paramValue, base)
  return 0 unless paramValue >= 0

  entries = {
    "g" => [0.29, 0.33, 0.41, 0.41, 0.73, 0.64, 0.64, 0.47, 0.54, 0.40],
    "a" => [0.21, 0.81, 0.65, 0.67, 0.49, 0.62, 0.55, 0.44, 0.49, 0.28],
    "t" => [0.58, 0.51, 0.69, 0.56, 0.75, 0.55, 0.40, 0.39, 0.24, 0.28],
    "c" => [0.31, 0.39, 0.44, 0.43, 0.59, 0.59, 0.64, 0.51, 0.64, 0.82]
  }.fetch(base, [])

  # The bounds are ascending, so the number of bounds reached is the index
  # of the interval that contains paramValue.
  lower_bounds = [0.17, 0.19, 0.21, 0.23, 0.25, 0.27, 0.29, 0.31, 0.33]
  interval = lower_bounds.count { |bound| paramValue >= bound }

  entries[interval]
end
|
862
|
+
|
863
|
+
# Translates a TestCode value into a status symbol.
#
#   value <  0.74          -> :unknown
#   0.74 <= value < 0.95   -> :putative_coding
#   value >= 0.95          -> :coding
#
# A value for which none of the comparisons holds (NaN) yields the empty
# String "".
def getConclusion(testCode_value)
  return :unknown if testCode_value < 0.74
  return :coding if testCode_value >= 0.95

  testCode_value >= 0.74 ? :putative_coding : ""
end
|
875
|
+
|
876
|
+
|
877
|
+
end
|