full_lengther_next 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +27 -0
- data/PostInstall.txt +6 -0
- data/README.rdoc +147 -0
- data/Rakefile +37 -0
- data/bin/download_fln_dbs.rb +197 -0
- data/bin/full_lengther_next +173 -0
- data/bin/make_user_db.rb +144 -0
- data/lib/full_lengther_next.rb +13 -0
- data/lib/full_lengther_next/classes/common_functions.rb +94 -0
- data/lib/full_lengther_next/classes/fl2_stats.rb +222 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +688 -0
- data/lib/full_lengther_next/classes/fl_string_utils.rb +139 -0
- data/lib/full_lengther_next/classes/lcs.rb +33 -0
- data/lib/full_lengther_next/classes/my_worker.rb +122 -0
- data/lib/full_lengther_next/classes/my_worker_manager.rb +167 -0
- data/lib/full_lengther_next/classes/orf.rb +32 -0
- data/lib/full_lengther_next/classes/sequence.rb +111 -0
- data/lib/full_lengther_next/classes/test_code.rb +877 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +287 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_full_lengther_next.rb +11 -0
- data/test/test_helper.rb +3 -0
- metadata +150 -0
@@ -0,0 +1,32 @@
|
|
1
|
+
|
2
|
+
# One candidate open reading frame found on a sequence.
#
# Accessors:
#   seq                 - nucleotide string of the region
#   t_start / t_end     - start and end coordinates of the region
#   frame               - reading frame the region was found in
#   stop_codon          - stop-codon flag supplied by the caller
#   type                - :N_terminus, :C_terminus, :Complete, :Internal,
#                         :Putative_Complete or :Putative_N_terminus
#   status              - :unknown, :putative_coding or :coding (starts as :unknown)
#   score               - numeric score, starts at 0
#   more_than_one_frame - starts as false
class Orf
  attr_accessor :seq, :t_start, :t_end, :frame, :stop_codon,
                :type, :status, :score, :more_than_one_frame

  # Stores the given region data and resets status, score and the
  # more_than_one_frame flag to their initial values.
  def initialize(orf_seq, t_start, t_end, frame, stop_codon, type)
    @seq, @t_start, @t_end = orf_seq, t_start, t_end
    @frame, @stop_codon, @type = frame, stop_codon, type
    @status = :unknown
    @score = 0
    @more_than_one_frame = false
  end

  # Returns true only when this ORF ends strictly inside other_orf, i.e.
  # after other_orf starts and before other_orf ends. The check is not
  # symmetric: an ORF that fully contains other_orf does not "overlap" it.
  def overlaps?(other_orf)
    own_end = t_end
    (own_end > other_orf.t_start) && (own_end < other_orf.t_end)
  end
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
|
2
|
+
require 'orf'
|
3
|
+
|
4
|
+
# One input sequence together with its ORFs, its annotations and a
# rejection flag.
class Sequence
  attr_accessor :seq_name, :seq_fasta, :seq_qual, :orfs, :sec_desc

  # seq_name  - identifier of the sequence.
  # seq_fasta - nucleotide string; upper-case IUPAC ambiguity codes are
  #             replaced by concrete bases right away.
  # seq_qual  - optional quality data (default ''). It is now stored as
  #             given; previously the argument was silently replaced by ''.
  def initialize(seq_name, seq_fasta, seq_qual = '')
    @seq_name = seq_name
    @seq_fasta = seq_fasta
    change_degenerated_nt!
    @seq_qual = seq_qual
    @sec_desc = ''
    @annotations = []
    @orfs = []

    @rejected = false
    @rejected_message = ''
  end

  # Builds an Orf from the given values and appends it to #orfs.
  def add_orf(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type)
    orf = Orf.new(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type)
    @orfs.push orf
  end

  # True once reject! has been called.
  def rejected?
    @rejected
  end

  # Marks the sequence as rejected and remembers the reason.
  def reject!(message = '')
    @rejected = true
    @rejected_message = message
  end

  # Returns every stored annotation of the given type.
  # Known types: :complete, :tmp_annotation, :error, :protein, :nucleotide,
  # :alignment, :tcode
  def get_annotations(annotation_type)
    @annotations.select { |a| a[:annotation_type] == annotation_type }
  end

  # Appends an annotation. With replace_existing set, every earlier
  # annotation of the same type is dropped first.
  def annotate(annotation_type, message = '', replace_existing = false)
    # reject! removes the matches in one pass instead of deleting from the
    # array while it is being iterated.
    @annotations.reject! { |a| a[:annotation_type] == annotation_type } if replace_existing

    @annotations.push({ :annotation_type => annotation_type, :message => message })
  end

  # Replaces upper-case IUPAC ambiguity codes in seq_fasta with concrete
  # lower-case bases. Each table entry is [candidate bases, use counter].
  def change_degenerated_nt!
    tranlaste_hash = {
      'R' => [%w[a g], 0],
      'W' => [%w[a t], 0],
      'M' => [%w[a c], 0],
      'K' => [%w[g t], 0],
      'S' => [%w[g c], 0],
      'Y' => [%w[c t], 0],
      'H' => [%w[a t c], 0],
      'B' => [%w[g t c], 0],
      'D' => [%w[g a t], 0],
      'V' => [%w[g a c], 0],
      'N' => [%w[g a c t], 0]
    }

    fix_degenerated_fasta!(tranlaste_hash)
  end

  # Rewrites seq_fasta using the given table. The counter of a code is
  # incremented on every occurrence (mutating tranlaste_hash) and then used
  # to cycle through that code's candidate bases, so repeated codes are
  # spread over all of their candidates.
  def fix_degenerated_fasta!(tranlaste_hash)
    fixed = @seq_fasta.split('').map do |nt|
      next nt unless nt =~ /[RWMKSYHBDVN]/

      entry = tranlaste_hash[nt]
      entry[1] += 1
      entry[0][entry[1] % entry[0].length]
    end

    @seq_fasta = fixed.compact.join
  end
end
|
@@ -0,0 +1,877 @@
|
|
1
|
+
|
2
|
+
require 'common_functions'
|
3
|
+
require 'scbi_plot'
|
4
|
+
|
5
|
+
include CommonFunctions
|
6
|
+
|
7
|
+
class TestCode
|
8
|
+
|
9
|
+
# Runs the testcode analysis on seq and stores the outcome on it as a single
# :tcode annotation (replacing any previous :tcode annotation).
#
# Sequences shorter than 200 nt are not analysed; they get an 'unknown'
# annotation spanning the whole sequence. Longer sequences are handed to
# #t_code and annotated with the region it reports.
#
# An older variant that scored the whole sequence and a variant that also
# reported the translated protein used to live here as disabled code.
def initialize(seq)
  if seq.seq_fasta.length >= 200
    # see add_region filter
    name, t_code, status, ref_start, ref_end, ref_frame, _orf, ref_msgs, = t_code(seq)
    seq.annotate(:tcode,"#{name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{status}\t#{t_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
    return
  end

  ref_name = seq.seq_name
  ref_status = 'unknown'
  ref_code = 0.0
  ref_msgs = 'Sequence length < 200 nt'
  ref_frame = 0
  ref_start = 0
  ref_end = seq.seq_fasta.length

  seq.annotate(:tcode,"#{ref_name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{ref_status}\t#{ref_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
end
|
78
|
+
|
79
|
+
# Finds the best candidate region of seq.
#
# All ORFs are generated with #uncomplete_orf_finder, each one is scored
# with #testCode_exec (score and status are written back onto the ORF), and
# #compare_regions picks one region per strand. The longer of the two wins;
# on a tie the plus-strand region is kept.
#
# Returns a 10-element array:
#   [name, score, type, t_start, t_end, frame, orf sequence, message,
#    stop_codon, more_than_one_frame]
# When no region of at least 200 nt is available, a placeholder with type
# 'unknown', zero coordinates and an explanatory message is returned.
def t_code(seq)
  name = seq.seq_name
  tc_fasta = seq.seq_fasta

  # generate every ORF of the sequence
  uncomplete_orf_finder(seq)

  # walk the ORFs from 5' to 3', score them and split them by strand
  plus_strand = []
  minus_strand = []
  seq.orfs.sort { |left, right| left.t_start <=> right.t_start }.each do |candidate|
    score, verdict = testCode_exec(candidate.seq)
    candidate.status = verdict
    candidate.score = score

    if candidate.frame < 0
      minus_strand.push candidate
    elsif candidate.frame > 0
      plus_strand.push candidate
    end
  end

  best_plus_region = plus_strand.empty? ? nil : compare_regions(plus_strand, tc_fasta)
  best_minus_region = minus_strand.empty? ? nil : compare_regions(minus_strand, tc_fasta)

  # keep the longest region of both strands
  best_region =
    if best_plus_region.nil?
      best_minus_region
    elsif best_minus_region.nil? || best_minus_region.seq.length <= best_plus_region.seq.length
      best_plus_region
    else
      best_minus_region
    end

  if best_region.nil? || best_region.seq.length < 200
    return [name, 0.0, 'unknown', 0, 0, 0, '', 'Non coding ORF found >= 200 nt ', false, false]
  end

  # flag regions that start at an ATG with no stop codon recorded for them
  ref_msgs = nil
  unless best_region.stop_codon
    if best_region.type == :Complete
      best_region.type = 'Putative Complete'
      ref_msgs = 'NO STOP codon before ATG. '
    elsif best_region.type == :N_terminus
      ref_msgs = 'NO STOP codon before ATG. '
    end
  end

  [name, best_region.score, best_region.type, best_region.t_start, best_region.t_end,
   best_region.frame, best_region.seq, ref_msgs, best_region.stop_codon, best_region.more_than_one_frame]
end
|
161
|
+
|
162
|
+
# when two regions from different frames are joined, this returns the type of the joined region (Complete, Internal, N-terminus, ...)
|
163
|
+
# Returns the type of the region obtained by joining a region of type
# prev_type with a following region of type one_type.
#
# Both arguments are expected to be one of :C_terminus, :N_terminus,
# :Internal or :Complete. Any combination not listed in the table
# (including two equal types) yields :Internal.
def type_fusion(prev_type,one_type)
  fusion_table = {
    :C_terminus => { :N_terminus => :Internal,   :Internal   => :Internal,   :Complete => :C_terminus },
    :N_terminus => { :C_terminus => :Complete,   :Internal   => :N_terminus, :Complete => :Complete },
    :Internal   => { :C_terminus => :C_terminus, :N_terminus => :Internal,   :Complete => :C_terminus },
    :Complete   => { :C_terminus => :Complete,   :N_terminus => :N_terminus, :Internal => :N_terminus }
  }

  fusion_table.fetch(prev_type, {}).fetch(one_type, :Internal)
end
|
200
|
+
|
201
|
+
# picks the largest coding region, even when it overlaps several frames
|
202
|
+
# Picks the longest ORF of one strand, without any frame repair.
#
# this_strand - array of ORF objects from the same strand.
# tc_fasta    - accepted for interface compatibility; not used here.
#
# The chosen ORF has its type overwritten with its status before it is
# returned. Returns nil when this_strand is empty.
def compare_regions(this_strand,tc_fasta)
  longest = this_strand.sort { |left, right| right.seq.length <=> left.seq.length }.first
  longest.type = longest.status unless longest.nil?
  longest
end
|
215
|
+
|
216
|
+
# adds a region only when it reaches the minimum size
|
217
|
+
# Registers a region on seq via seq.add_orf, but only when the region is at
# least 200 nt long. Shorter regions are ignored and nil is returned.
def add_region(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type,seq)
  return nil if orf_seq.length < 200

  seq.add_orf(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type)
end
|
225
|
+
|
226
|
+
# se buscan las regiones Complete, N-terminus, C-terminus e Internal en la secuencia de cada frame
|
227
|
+
# Scans the codons of one reading frame and reports every candidate region
# through #add_region.
#
# a     - array of codon strings for this frame (the last one may be short).
# frame - frame identifier passed through to #add_region.
# seq   - object passed through to #add_region.
#
# Regions reported (the stop flag handed to #add_region tells whether a
# stop codon had already been seen before the region):
#   :C_terminus - from position 0 up to the first stop codon
#   :Complete   - from an ATG up to the next stop codon
#   :N_terminus - from an ATG to the end, when no stop codon follows it
#   :Internal   - the whole frame, when it contains no stop codon at all
def generate_uncomplete_orf(a,frame,seq)
  stop_codons = %w[TAG TGA TAA]

  before_first_stop = true # no stop codon seen yet, region from 0 still open
  atg_open = false         # an ATG has been seen since the last stop codon
  ended_by_stop = false    # the most recent ATG/stop event was a stop codon
  any_stop = false         # at least one stop codon seen so far

  orf = ''
  t_start = 0
  t_end = 0

  a.each do |codon|
    t_end += 3
    orf += codon

    if codon == 'ATG' && !atg_open
      atg_open = true
      ended_by_stop = false
      t_start = t_end - 3
      next
    end

    next unless stop_codons.include?(codon)

    if any_stop
      # case 1, complete orf
      add_region(orf[t_start..t_end], t_start, t_end, frame, any_stop, :Complete,seq) if atg_open
    elsif before_first_stop
      # case 2, C_terminus
      add_region(orf[0..t_end], 0, t_end, frame, any_stop, :C_terminus,seq)
      # case 3, putative complete, complete without stop codon before atg
      add_region(orf[t_start..t_end], t_start, t_end, frame, any_stop, :Complete, seq) if atg_open
    end

    ended_by_stop = true
    any_stop = true
    before_first_stop = false
    atg_open = false
    t_start += 3
  end

  # case 4, N_terminus and case 6, putative N_terminus
  if atg_open && !ended_by_stop
    add_region(orf[t_start..t_end], t_start, t_end, frame, any_stop, :N_terminus,seq)
  end

  # case 5, internal
  add_region(orf[0..t_end], 0, t_end, frame, any_stop, :Internal,seq) if before_first_stop && !any_stop
end
|
288
|
+
|
289
|
+
# recorre cada uno de los frames y los pasa a generate_uncomplete_orf
|
290
|
+
# Splits seq.seq_fasta into codons for all six reading frames and passes
# each frame to #generate_uncomplete_orf.
#
# Frames 1, 2 and 3 come from the upper-cased sequence with 0, 1 and 2
# leading bases removed; frames -1, -2 and -3 are built the same way from
# the result of String#complementary_dna on the upper-cased sequence.
#
# Returns whatever the last #generate_uncomplete_orf call (frame -3) returns.
def uncomplete_orf_finder(seq)
  last_result = nil

  [1, -1].each do |strand_sign|
    strand = seq.seq_fasta.upcase
    # the complementary strand supplies the negative frames
    strand = strand.complementary_dna if strand_sign < 0

    1.upto(3) do |frame_number|
      # shift the reading frame by dropping one more leading base
      strand.sub!(/^./, '') if frame_number > 1
      codons = strand.split('').each_slice(3).map { |triplet| triplet.join }
      last_result = generate_uncomplete_orf(codons, strand_sign * frame_number, seq)
    end
  end

  last_result
end
|
320
|
+
|
321
|
+
|
322
|
+
|
323
|
+
|
324
|
+
|
325
|
+
|
326
|
+
|
327
|
+
|
328
|
+
# | = ATG
|
329
|
+
# * = STOP
|
330
|
+
#
|
331
|
+
# -------*---|>>>>>>>>>>>>>> 1 >>>>>>>>>>>>*-------------
|
332
|
+
# >>>>>>>>>> 2 >>>>>>>>>>>>>*----------------------------
|
333
|
+
# ---|>>>>>>>> 3 >>>>>>>>>>>>*---------------------------
|
334
|
+
# -------------------*-----------------|>>>>>>>>>> 4 >>>>
|
335
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>> 5 >>>>>>>>>>>>>>>>>>>>>>>>>>
|
336
|
+
#--------------------------- 6 ----------|>>>>>>>>>>>>>>>
|
337
|
+
#
|
338
|
+
# 1 complete orf, with stop codon before atg
|
339
|
+
# 2 C-terminus
|
340
|
+
# 3 putative complete, complete without stop codon before atg
|
341
|
+
# 4 N_terminus, with stop codon before atg
|
342
|
+
# 5 internal
|
343
|
+
# 6 putative N_terminus, N_terminus without stop codon before atg
|
344
|
+
|
345
|
+
# no se usa
|
346
|
+
# Unused. Appends notes about other coding regions of the same strand to
# the message of the reference ORF.
#
# orfs_array - arrays of [sequence, start, end, frame, ...]; sorted in place
#              by start, and start/end are rewritten with #corrige_frame.
# tc_fasta   - full sequence, sliced and re-scored with #testCode_exec.
# ref_*      - values describing the reference ORF; only ref_msgs can come
#              back changed.
#
# Returns [ref_status, ref_code, name, ref_start, ref_end, ref_frame,
#          ref_orf, ref_msgs].
def orf_fusion(orfs_array,tc_fasta,name,ref_status,ref_code,ref_start,ref_end,ref_frame,ref_orf,ref_msgs)
  lower_start = 9999
  # The code that would raise higher_end is disabled, so the slice below
  # always runs to the end of tc_fasta.
  higher_end = 0
  pending = false
  note = ''

  # the reference is the longest coding ORF; coding ORFs on the same strand
  # that are not the reference are reported in the message
  orfs_array.sort! { |left, right| left[1] <=> right[1] }

  orfs_array.each do |orf|
    orf[1], orf[2] = corrige_frame(orf[3], orf[1], orf[2])
    next if orf[0] == ref_orf

    overlapping = (ref_end >= orf[1] && ref_end <= orf[2]) || (ref_start >= orf[1] && ref_start <= orf[2])
    separated = !overlapping && (ref_end < orf[1] || ref_start > orf[2])

    if overlapping
      if ref_frame > 0
        note = ", overlapping coding region (#{orf[1]},#{orf[2]})"
      elsif ref_frame < 0
        note = ", overlapping coding region (-#{orf[1]},-#{orf[2]})"
      end
      pending = true
    elsif separated
      if ref_frame > 0
        note = ", other coding region (#{orf[1]},#{orf[2]})"
      elsif ref_frame < 0
        note = ", other coding region (-#{orf[1]},-#{orf[2]})"
      end
      pending = true
    end

    # pending (and the last note) carry over to later ORFs until a slice
    # scores as something other than 'unknown'
    next unless pending

    lower_start = orf[1] if orf[1] < lower_start
    _score, verdict = testCode_exec(tc_fasta[(lower_start - 1)..(higher_end - 1)])

    if verdict != 'unknown'
      # several ORFs are coding once joined
      ref_msgs += note
      pending = false
    end
  end

  [ref_status, ref_code, name, ref_start, ref_end, ref_frame, ref_orf, ref_msgs]
end
|
420
|
+
# no se usa
|
421
|
+
# Unused. Earlier implementation of the reference-ORF selection.
#
# ORFs come from String#orf_finder on seq.seq_fasta, and each one is scored
# with #testCode_exec. The reference is chosen in this order of preference:
# the longest 'coding' ORF, else the longest 'putative_coding' ORF, else the
# most recently seen ORF (with the message 'Bad testcode score'). When a
# strand holds more than one ORF whose status is not 'unknown' and the
# reference lies on that strand, #orf_fusion is asked to extend the message.
#
# Returns [name, score, status, start, end, frame, orf sequence, message].
# For a minus-frame reference, start and end are strings combining the
# coordinates returned by #reverse_seq with the original ones. When
# orf_finder yields nothing, start and end are nil.
def t_code_old(seq)
  name = seq.seq_name
  tc_fasta = seq.seq_fasta

  # every ORF of the sequence, as produced by fl_string_utils
  orfs_array = tc_fasta.orf_finder

  if orfs_array[0].nil?
    return [name, 0.0, 'unknown', nil, nil, 0, '', 'ORF length < 200 nt']
  end

  ref_code = 0.0
  ref_name = ''
  ref_start = 0
  ref_end = 0
  ref_frame = 0
  ref_status = ''
  ref_orf = ''
  ref_msgs = ''

  good_orfs_plus = []
  good_orfs_minus = []
  more_than_one_plus = false
  more_than_one_minus = false
  one_coding = false
  one_putative = false

  orfs_array.each do |orf|
    next unless orf[0]

    score, verdict = testCode_exec(orf[0])

    # collect every ORF with a usable verdict, per strand; score and verdict
    # are appended to the ORF array as orf[5] and orf[6]
    if verdict != 'unknown'
      if orf[3].to_i < 0
        more_than_one_minus = true unless good_orfs_minus.empty?
        orf.push score
        orf.push verdict
        good_orfs_minus.push orf
      elsif orf[3].to_i > 0
        more_than_one_plus = true unless good_orfs_plus.empty?
        orf.push score
        orf.push verdict
        good_orfs_plus.push orf
      end
    end

    longer = orf[0].length > ref_orf.length
    choice =
      if longer && verdict == 'coding'
        :coding # keep the longest coding ORF
      elsif !one_coding && longer && verdict == 'putative_coding'
        :putative
      elsif !one_coding && !one_putative
        :fallback
      end
    next unless choice

    ref_code = score.to_f
    ref_name = name
    ref_orf = orf[0]
    ref_start = orf[1]
    ref_end = orf[2]
    ref_frame = orf[3]

    if choice == :fallback
      ref_status = verdict
      ref_msgs = 'Bad testcode score'
    else
      one_coding = true if choice == :coding
      one_putative = true if choice == :putative
      ref_status, ref_msgs = checking_beginning(orf[4], verdict, orf[1], tc_fasta, orf[2])
    end
  end

  # prepare the best ORF for returning
  ref_start, ref_end = corrige_frame(ref_frame, ref_start, ref_end)

  # more than one valid ORF was found
  if more_than_one_plus || more_than_one_minus
    if ref_frame > 0
      if more_than_one_plus
        ref_status, ref_code, ref_name, ref_start, ref_end, ref_frame, ref_orf, ref_msgs =
          orf_fusion(good_orfs_plus, tc_fasta, name, ref_status, ref_code, ref_start, ref_end, ref_frame, ref_orf, ref_msgs)
      end
    elsif ref_frame < 0
      if more_than_one_minus
        ref_status, ref_code, ref_name, ref_start, ref_end, ref_frame, ref_orf, ref_msgs =
          orf_fusion(good_orfs_minus, tc_fasta, name, ref_status, ref_code, ref_start, ref_end, ref_frame, ref_orf, ref_msgs)
      end
    end
  end

  if ref_frame < 0
    _kk1, _kk2, ref_start2, ref_end2 = reverse_seq(tc_fasta, ref_frame, ref_start, ref_end)
    ref_start_ok = "#{ref_start2} (-#{ref_start})"
    ref_end_ok = "#{ref_end2} (-#{ref_end})"
  else
    ref_start_ok = ref_start
    ref_end_ok = ref_end
  end

  [ref_name, ref_code, ref_status, ref_start_ok, ref_end_ok, ref_frame, ref_orf, ref_msgs]
end
|
583
|
+
# no se usa
|
584
|
+
# gnuplot must be installed
|
585
|
+
# Unused; gnuplot must be installed.
#
# Slides a 200-nt window over the upper-cased sequence, scores every window
# with #testCode_exec and plots the forward-strand scores with
# ScbiPlot::Lines into 'lines.png'.
#
# The windows of the complementary strand are scored as well and each of
# them is printed to standard output, but those scores are not plotted.
#
# Returns the result of ScbiPlot::Lines#do_graph.
def window_walking(seq)
  window = 200
  s = seq.seq_fasta.upcase

  y1 = []
  (0...(s.length - window)).each do |offset|
    score, = testCode_exec(s[offset..offset + 199])
    y1.push score
  end

  src = s.complementary_dna
  y2 = []
  (0...(src.length - window)).each do |offset|
    score, = testCode_exec(src[offset..offset + 199])
    puts src[offset..offset + 199]
    y2.push score
  end

  # Create lines plot
  p = ScbiPlot::Lines.new('lines.png','title')
  p.add_x((1..(src.length - window)).to_a)
  p.add_series('serie0', y1)
  p.do_graph
end
|
621
|
+
# no se usa
|
622
|
+
# para escoger la region codificante mas grande, incluso solapando verios frames
|
623
|
+
# Unused. The whole implementation is disabled, so this method does nothing
# and returns nil for any input.
#
# The disabled algorithm worked as follows: walk the ORFs of one strand
# and, whenever the previous region overlapped the current ORF, build a
# merged region from tc_fasta (previous start up to the current end),
# re-score it with testCode_exec, combine the two types with type_fusion and
# flag it as spanning more than one frame. The longest region whose status
# was :coding was then returned.
def compare_regions_old(this_strand,tc_fasta)
  nil
end
|
697
|
+
|
698
|
+
###### test code program functions #########
|
699
|
+
|
700
|
+
def testCode_exec (sequence)
|
701
|
+
|
702
|
+
sequence.downcase!
|
703
|
+
basesOne = [0,0,0,0];
|
704
|
+
basesTwo = [0,0,0,0];
|
705
|
+
basesThree = [0,0,0,0];
|
706
|
+
|
707
|
+
#for (j = 0; j < sequence.length; j = j + 3)
|
708
|
+
|
709
|
+
0.step(sequence.length-1,3) do |j|
|
710
|
+
if (sequence[j].chr == "g")
|
711
|
+
basesOne[0] = basesOne[0] + 1;
|
712
|
+
elsif (sequence[j].chr == "a")
|
713
|
+
basesOne[1] = basesOne[1] + 1;
|
714
|
+
elsif (sequence[j].chr == "t")
|
715
|
+
basesOne[2] = basesOne[2] + 1;
|
716
|
+
elsif (sequence[j].chr == "c")
|
717
|
+
basesOne[3] = basesOne[3] + 1;
|
718
|
+
else
|
719
|
+
end
|
720
|
+
end
|
721
|
+
|
722
|
+
#for (j = 1; j < sequence.length; j = j + 3)
|
723
|
+
1.step(sequence.length-1,3) do |j|
|
724
|
+
if (sequence[j].chr == "g")
|
725
|
+
basesTwo[0] = basesTwo[0] + 1;
|
726
|
+
elsif (sequence[j].chr == "a")
|
727
|
+
basesTwo[1] = basesTwo[1] + 1;
|
728
|
+
elsif (sequence[j].chr == "t")
|
729
|
+
basesTwo[2] = basesTwo[2] + 1;
|
730
|
+
elsif (sequence[j].chr == "c")
|
731
|
+
basesTwo[3] = basesTwo[3] + 1;
|
732
|
+
else
|
733
|
+
end
|
734
|
+
end
|
735
|
+
|
736
|
+
|
737
|
+
#for (j = 2; j < sequence.length; j = j + 3)
|
738
|
+
2.step(sequence.length-1,3) do |j|
|
739
|
+
if (sequence[j].chr == "g")
|
740
|
+
basesThree[0] = basesThree[0] + 1;
|
741
|
+
elsif (sequence[j].chr == "a")
|
742
|
+
basesThree[1] = basesThree[1] + 1;
|
743
|
+
elsif (sequence[j].chr == "t")
|
744
|
+
basesThree[2] = basesThree[2] + 1;
|
745
|
+
elsif (sequence[j].chr == "c")
|
746
|
+
basesThree[3] = basesThree[3] + 1;
|
747
|
+
else
|
748
|
+
end
|
749
|
+
end
|
750
|
+
|
751
|
+
paramG = calcParam(basesOne[0],basesTwo[0],basesThree[0]);
|
752
|
+
contentG = countBases(basesOne[0],basesTwo[0],basesThree[0]) / sequence.length.to_f;
|
753
|
+
posProbG = usePosParam(paramG,"g");
|
754
|
+
contProbG = useContParam(contentG,"g");
|
755
|
+
paramA = calcParam(basesOne[1],basesTwo[1],basesThree[1]);
|
756
|
+
contentA = countBases(basesOne[1],basesTwo[1],basesThree[1]) / sequence.length.to_f;
|
757
|
+
posProbA = usePosParam(paramA,"a");
|
758
|
+
contProbA = useContParam(contentA,"a");
|
759
|
+
paramT = calcParam(basesOne[2],basesTwo[2],basesThree[2]);
|
760
|
+
contentT = countBases(basesOne[2],basesTwo[2],basesThree[2]) / sequence.length.to_f;
|
761
|
+
posProbT = usePosParam(paramT,"t");
|
762
|
+
contProbT = useContParam(contentT,"t");
|
763
|
+
paramC = calcParam(basesOne[3],basesTwo[3],basesThree[3]);
|
764
|
+
contentC = countBases(basesOne[3],basesTwo[3],basesThree[3]) / sequence.length.to_f;
|
765
|
+
posProbC = usePosParam(paramC,"c");
|
766
|
+
contProbC = useContParam(contentC,"c");
|
767
|
+
valueY = posProbG * 0.31 + contProbG * 0.15 + posProbA * 0.26 + contProbA * 0.11 + posProbT * 0.33 + contProbT * 0.14 + posProbC * 0.18 + contProbC * 0.12;
|
768
|
+
valueY = ((valueY*1000.0).round/1000.0);
|
769
|
+
|
770
|
+
# return 'The TestCode value is <b>' + valueY.to_s + '</b>, which indicates that the sequence ' + getConclusion(valueY) + '.';
|
771
|
+
return [valueY.to_s, getConclusion(valueY)]
|
772
|
+
end
|
773
|
+
|
774
|
+
def calcParam (valueOne,valueTwo,valueThree)
|
775
|
+
paramArray = [valueOne,valueTwo,valueThree];
|
776
|
+
paramArray = paramArray.sort#{|a,b| return a-b}#(compareNumbers);
|
777
|
+
paramValue = paramArray[2] / (paramArray[0] + 1.0);
|
778
|
+
# puts paramArray.to_json
|
779
|
+
return paramValue;
|
780
|
+
end
|
781
|
+
|
782
|
+
def countBases (valueOne,valueTwo,valueThree)
|
783
|
+
return valueOne + valueTwo + valueThree;
|
784
|
+
end
|
785
|
+
|
786
|
+
def usePosParam (paramValue,base)
|
787
|
+
arrayOfCodingProb = [];
|
788
|
+
codeProb = 0;
|
789
|
+
if (base == "g")
|
790
|
+
arrayOfCodingProb = [0.08,0.08,0.16,0.27,0.48,0.53,0.64,0.74,0.88,0.90]
|
791
|
+
elsif (base == "a")
|
792
|
+
arrayOfCodingProb = [0.22,0.20,0.34,0.45,0.68,0.58,0.93,0.84,0.68,0.94]
|
793
|
+
elsif (base == "t")
|
794
|
+
arrayOfCodingProb = [0.09,0.09,0.20,0.54,0.44,0.69,0.68,0.91,0.97,0.97]
|
795
|
+
elsif (base == "c")
|
796
|
+
arrayOfCodingProb = [0.23,0.30,0.33,0.51,0.48,0.66,0.81,0.70,0.70,0.80]
|
797
|
+
end
|
798
|
+
|
799
|
+
|
800
|
+
if (paramValue >= 0 and paramValue < 1.1)
|
801
|
+
codeProb = arrayOfCodingProb[0];
|
802
|
+
elsif (paramValue >=1.1 and paramValue < 1.2)
|
803
|
+
codeProb = arrayOfCodingProb[1];
|
804
|
+
elsif (paramValue >=1.2 and paramValue < 1.3)
|
805
|
+
codeProb = arrayOfCodingProb[2];
|
806
|
+
elsif (paramValue >=1.3 and paramValue < 1.4)
|
807
|
+
codeProb = arrayOfCodingProb[3];
|
808
|
+
elsif (paramValue >=1.4 and paramValue < 1.5)
|
809
|
+
codeProb = arrayOfCodingProb[4];
|
810
|
+
elsif (paramValue >=1.5 and paramValue < 1.6)
|
811
|
+
codeProb = arrayOfCodingProb[5];
|
812
|
+
elsif (paramValue >=1.6 and paramValue < 1.7)
|
813
|
+
codeProb = arrayOfCodingProb[6];
|
814
|
+
elsif (paramValue >=1.7 and paramValue < 1.8)
|
815
|
+
codeProb = arrayOfCodingProb[7];
|
816
|
+
elsif (paramValue >=1.8 and paramValue < 1.9)
|
817
|
+
codeProb = arrayOfCodingProb[8];
|
818
|
+
elsif (paramValue >=1.9)
|
819
|
+
codeProb = arrayOfCodingProb[9];
|
820
|
+
end
|
821
|
+
|
822
|
+
return codeProb;
|
823
|
+
end
|
824
|
+
|
825
|
+
def useContParam (paramValue,base)
|
826
|
+
arrayOfCodingProb = [];
|
827
|
+
codeProb = 0;
|
828
|
+
if (base == "g")
|
829
|
+
arrayOfCodingProb = [0.29,0.33,0.41,0.41,0.73,0.64,0.64,0.47,0.54,0.40]
|
830
|
+
elsif (base == "a")
|
831
|
+
arrayOfCodingProb = [0.21,0.81,0.65,0.67,0.49,0.62,0.55,0.44,0.49,0.28]
|
832
|
+
elsif (base == "t")
|
833
|
+
arrayOfCodingProb = [0.58,0.51,0.69,0.56,0.75,0.55,0.40,0.39,0.24,0.28]
|
834
|
+
elsif (base == "c")
|
835
|
+
arrayOfCodingProb = [0.31,0.39,0.44,0.43,0.59,0.59,0.64,0.51,0.64,0.82]
|
836
|
+
end
|
837
|
+
|
838
|
+
if (paramValue >= 0 and paramValue < 0.17)
|
839
|
+
codeProb = arrayOfCodingProb[0];
|
840
|
+
elsif (paramValue >=0.17 and paramValue < 0.19)
|
841
|
+
codeProb = arrayOfCodingProb[1];
|
842
|
+
elsif (paramValue >=0.19 and paramValue < 0.21)
|
843
|
+
codeProb = arrayOfCodingProb[2];
|
844
|
+
elsif (paramValue >=0.21 and paramValue < 0.23)
|
845
|
+
codeProb = arrayOfCodingProb[3];
|
846
|
+
elsif (paramValue >=0.23 and paramValue < 0.25)
|
847
|
+
codeProb = arrayOfCodingProb[4];
|
848
|
+
elsif (paramValue >=0.25 and paramValue < 0.27)
|
849
|
+
codeProb = arrayOfCodingProb[5];
|
850
|
+
elsif (paramValue >=0.27 and paramValue < 0.29)
|
851
|
+
codeProb = arrayOfCodingProb[6];
|
852
|
+
elsif (paramValue >=0.29 and paramValue < 0.31)
|
853
|
+
codeProb = arrayOfCodingProb[7];
|
854
|
+
elsif (paramValue >=0.31 and paramValue < 0.33)
|
855
|
+
codeProb = arrayOfCodingProb[8];
|
856
|
+
elsif (paramValue >=0.33)
|
857
|
+
codeProb = arrayOfCodingProb[9];
|
858
|
+
end
|
859
|
+
|
860
|
+
return codeProb;
|
861
|
+
end
|
862
|
+
|
863
|
+
def getConclusion (testCode_value)
|
864
|
+
codeProb = "";
|
865
|
+
if (testCode_value < 0.74)
|
866
|
+
codeProb = :unknown;
|
867
|
+
elsif (testCode_value >=0.74 and testCode_value < 0.95)
|
868
|
+
codeProb = :putative_coding;
|
869
|
+
elsif (testCode_value >=0.95)
|
870
|
+
codeProb = :coding;
|
871
|
+
end
|
872
|
+
|
873
|
+
return codeProb;
|
874
|
+
end
|
875
|
+
|
876
|
+
|
877
|
+
end
|