full_lengther_next 0.0.8 → 0.5.6
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -5,7 +5,7 @@ class String
|
|
5
5
|
s = self.upcase
|
6
6
|
a = s.split('').each_slice(3).map{|e| e.join}
|
7
7
|
|
8
|
-
c={'GCT'=>'A',
|
8
|
+
c={ 'GCT'=>'A','GCC'=>'A','GCA'=>'A','GCG'=>'A',
|
9
9
|
'CGT'=>'R','CGC'=>'R','CGA'=>'R','CGG'=>'R','AGA'=>'R','AGG'=>'R',
|
10
10
|
'AAT'=>'N','AAC'=>'N',
|
11
11
|
'GAT'=>'D','GAC'=>'D',
|
@@ -37,6 +37,8 @@ class String
|
|
37
37
|
else
|
38
38
|
c[e]||'x'
|
39
39
|
end
|
40
|
+
else
|
41
|
+
'x'
|
40
42
|
end
|
41
43
|
}
|
42
44
|
return res.compact.join
|
@@ -136,4 +138,4 @@ class String
|
|
136
138
|
return self.reverse.split('').map{|e| c[e]}.join
|
137
139
|
end
|
138
140
|
|
139
|
-
end
|
141
|
+
end
|
@@ -1,600 +1,641 @@
|
|
1
|
+
require 'types.rb'
|
1
2
|
|
2
3
|
module FlnStats
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
stats_file.puts ' <tr>
|
60
|
-
<td colspan="2" align="left">Putative chimera</td>
|
61
|
-
<td align="right">'+chimera_total.to_s+'</td>
|
62
|
-
<td align="right">'+'%.2f' % (100*chimera_total.to_f/total_seqs.to_f).to_s+' %</td>
|
63
|
-
</tr>'
|
64
|
-
status_suma += chimera_total
|
65
|
-
|
66
|
-
# añadimos los coding, P.coding
|
67
|
-
tcode_array.each do |status|
|
68
|
-
if (status[1] == 'Coding')
|
69
|
-
stats_file.puts ' <tr>
|
70
|
-
<td rowspan="2" align="left">'+status[1].to_s+'</td>
|
71
|
-
<td align="left">Sure</td>
|
72
|
-
<td align="right">'+status[0].to_s+'</td>
|
73
|
-
<td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
|
74
|
-
</tr>'
|
75
|
-
elsif (status[1] == 'Putative Coding')
|
76
|
-
stats_file.puts ' <tr>
|
77
|
-
<td align="left">Putative</td>
|
78
|
-
<td align="right">'+status[0].to_s+'</td>
|
79
|
-
<td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
|
80
|
-
</tr>'
|
4
|
+
# Builds a fresh table of summary counters, every one starting at zero.
#
# The keys are inserted in a fixed order. write_txt iterates a stats hash in
# insertion order, so the order of the list below is also the row order of
# any text report produced from this hash.
#
# Returns a new Hash mapping each counter name (String) to 0.
def initialize_stats_hash
  counter_names = [
    'input_seqs', 'output_seqs', 'failed',
    'sequences_>200', 'sequences_>500', 'longest_unigene',
    'good_seqs',
    'artifacts', 'misassembled', 'chimeras', 'other_artifacts',
    'unknown', 'unknown_>200', 'unknown_>500',
    'prot_annotated',
    'complete', 'complete_sure', 'complete_putative',
    'n_terminal', 'n_terminal_sure', 'n_terminal_putative',
    'c_terminal', 'c_terminal_sure', 'c_terminal_putative',
    'internal',
    'swissprot', 'trembl', 'userdb',
    'ncrna',
    'coding', 'coding_sure', 'coding_putative', 'coding_>200', 'coding_>500',
    'different_orthologues', 'different_completes',
    'BA_index'
  ]

  stats_hash = {}
  counter_names.each { |counter| stats_hash[counter] = 0 }
  stats_hash
end
|
46
|
+
|
47
|
+
# Extracts an organism name from a hit description and increments its count
# in +taxonomy+. Only the first two whitespace-separated words of the
# extracted text are kept as the organism key.
#
# name     - String with the hit description. It is never modified.
# taxonomy - Hash of organism name => number of hits; updated in place.
#
# Three description layouts are handled:
# * descriptions containing 'OS=': the text after 'OS=' up to ' GN=';
# * descriptions starting with 'sp=' or 'tr=': the two words that precede
#   a parenthesised group;
# * anything else: the text after the first ';' up to the first '.'.
#
# Returns the updated count, or nil when no organism could be extracted.
def get_taxonomy(name, taxonomy)
  organism = nil
  if name.include?('OS=')
    fields = name.split('OS=', 2)
    # to_s: nothing after 'OS=' makes split return an empty array.
    organism = fields.last.split(' GN=').first.to_s.strip
  elsif name.start_with?('sp=', 'tr=')
    # This test used to be written with `=` instead of `==`, which overwrote
    # the first three characters of +name+ and was always true.
    if name =~ /(\w+ \w+) \(([\w ]+)\) \(([\w ]+)\)/
      organism = $1
    elsif name =~ /(\w+ \w+) \(([\w ]+)\)/
      organism = $1
    end
  else
    organism = name.split(';', 2).last.to_s
    organism = organism.split('.', 2).first.to_s
    organism = organism.gsub(/\(\D+\)/, '')
    if organism.split(' ').length > 1
      # NOTE(review): removing every single space merges the words into one
      # token, which defeats the two-word selection below. This pattern may
      # have been a double space collapsed when the source was captured --
      # confirm against the original file.
      organism = organism.gsub('.', '').gsub(/^ /, '').gsub(' ', '').strip
    end
  end

  return nil if organism.nil?

  organism = organism.split(' ')[0..1].join(' ')
  # A blank key carries no taxonomic information; do not count it.
  return nil if organism.empty?

  taxonomy[organism] = (taxonomy[organism] || 0) + 1
end
|
141
82
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
</table>'
|
168
|
-
|
169
|
-
# imprimimos la tabla Report guiding assembly quality -------------------------------------------------------------
|
170
|
-
stats_file.puts html_as
|
171
|
-
stats_file.puts ' <tr>
|
172
|
-
<td align="left">Unigenes</td>
|
173
|
-
<td align="right">'+total_seqs.to_s+'</td>
|
174
|
-
<td align="right">'+'%.2f' % (100*total_seqs.to_f/total_seqs.to_f).to_s+' %</td>
|
175
|
-
</tr>'
|
176
|
-
stats_file.puts ' <tr>
|
177
|
-
<td align="left">Unigenes >'+size_filter2.to_s+'pb</td>
|
178
|
-
<td align="right">'+uni_500.to_s+'</td>
|
179
|
-
<td align="right">'+'%.2f' % (100*uni_500.to_f/total_seqs.to_f).to_s+' %</td>
|
180
|
-
</tr>'
|
181
|
-
stats_file.puts ' <tr>
|
182
|
-
<td align="left">Unigenes >'+size_filter1.to_s+'pb</td>
|
183
|
-
<td align="right">'+uni_200.to_s+'</td>
|
184
|
-
<td align="right">'+'%.2f' % (100*uni_200.to_f/total_seqs.to_f).to_s+' %</td>
|
185
|
-
</tr>'
|
186
|
-
stats_file.puts ' <tr>
|
187
|
-
<td align="left">Longest unigene</td>
|
188
|
-
<td align="right">'+longest_one.to_s+'</td>
|
189
|
-
<td align="right">-</td>
|
190
|
-
</tr>'
|
191
|
-
stats_file.puts ' <tr>
|
192
|
-
<td align="left">With orthologue <sup>1</sup></td>
|
193
|
-
<td align="right">'+seqs_number1.to_s+'</td>
|
194
|
-
<td align="right">'+'%.2f' % (100*seqs_number1.to_f/total_seqs.to_f).to_s+' %</td>
|
195
|
-
</tr>'
|
196
|
-
|
197
|
-
if (seqs_number1.to_i > 0)
|
198
|
-
stats_file.puts ' <tr>
|
199
|
-
<td align="left"> Different orthologue IDs</td>
|
200
|
-
<td align="right">'+seq_uniq.to_s+'</td>
|
201
|
-
<td align="right">'+'%.2f' % (100*seq_uniq.to_f/seqs_number1.to_f).to_s+' %</td>
|
202
|
-
</tr>'
|
203
|
-
stats_file.puts ' <tr>
|
204
|
-
<td align="left"> Complete transcripts</td>
|
205
|
-
<td align="right">'+status_array[0][0].to_s+'</td>
|
206
|
-
<td align="right">'+'%.2f' % (100*status_array[0][0].to_f/seqs_number1.to_f).to_s+' %</td>
|
207
|
-
</tr>'
|
208
|
-
stats_file.puts ' <tr>
|
209
|
-
<td align="left"> Different complete transcripts</td>
|
210
|
-
<td align="right">'+complete_uniq.to_s+'</td>
|
211
|
-
<td align="right">'+'%.2f' % (100*complete_uniq.to_f/seqs_number1.to_f).to_s+' %</td>
|
212
|
-
</tr>'
|
213
|
-
stats_file.puts ' <tr>
|
214
|
-
<td align="left"> Misassembled</td>
|
215
|
-
<td align="right">'+error_1_num.to_s+'</td>
|
216
|
-
<td align="right">'+'%.2f' % (100*error_1_num.to_f/seqs_number1.to_f).to_s+' %</td>
|
217
|
-
</tr>'
|
218
|
-
stats_file.puts ' <tr>
|
219
|
-
<td align="left"> Putative chimera</td>
|
220
|
-
<td align="right">'+chimera_total.to_s+'</td>
|
221
|
-
<td align="right">'+'%.2f' % (100*chimera_total.to_f/seqs_number1.to_f).to_s+' %</td>
|
222
|
-
</tr>'
|
223
|
-
end
|
224
|
-
stats_file.puts ' <tr>
|
225
|
-
<td align="left">Without orthologue <sup>1</sup></td>
|
226
|
-
<td align="right">'+no_db.to_s+'</td>
|
227
|
-
<td align="right">'+'%.2f' % (100*seqs_number2.to_f/total_seqs.to_f).to_s+' %</td>
|
228
|
-
</tr>'
|
229
|
-
|
230
|
-
if (no_db.to_i > 0) && (seqs_number2.to_i > 0)
|
231
|
-
stats_file.puts ' <tr>
|
232
|
-
<td align="left"> Coding (all)</td>
|
233
|
-
<td align="right">'+tcode_array[0][0].to_s+'</td>
|
234
|
-
<td align="right">'+'%.2f' % (100*tcode_array[0][0].to_f/no_db.to_f).to_s+' %</td>
|
235
|
-
</tr>'
|
236
|
-
stats_file.puts ' <tr>
|
237
|
-
<td align="left"> Coding > '+size_filter1.to_s+'bp</td>
|
238
|
-
<td align="right">'+tcode_array[0][2].to_s+'</td>
|
239
|
-
<td align="right">'+'%.2f' % (100*tcode_array[0][2].to_f/no_db.to_f).to_s+' %</td>
|
240
|
-
</tr>'
|
241
|
-
stats_file.puts ' <tr>
|
242
|
-
<td align="left"> Coding > '+size_filter2.to_s+'bp</td>
|
243
|
-
<td align="right">'+tcode_array[0][3].to_s+'</td>
|
244
|
-
<td align="right">'+'%.2f' % (100*tcode_array[0][3].to_f/no_db.to_f).to_s+' %</td>
|
245
|
-
</tr>'
|
246
|
-
stats_file.puts ' <tr>
|
247
|
-
<td align="left"> Putative Coding (all)</td>
|
248
|
-
<td align="right">'+tcode_array[1][0].to_s+'</td>
|
249
|
-
<td align="right">'+'%.2f' % (100*tcode_array[1][0].to_f/no_db.to_f).to_s+' %</td>
|
250
|
-
</tr>'
|
251
|
-
stats_file.puts ' <tr>
|
252
|
-
<td align="left"> Putative Coding > '+size_filter1.to_s+'bp</td>
|
253
|
-
<td align="right">'+tcode_array[1][2].to_s+'</td>
|
254
|
-
<td align="right">'+'%.2f' % (100*tcode_array[1][2].to_f/no_db.to_f).to_s+' %</td>
|
255
|
-
</tr>'
|
256
|
-
stats_file.puts ' <tr>
|
257
|
-
<td align="left"> Putative Coding > '+size_filter2.to_s+'bp</td>
|
258
|
-
<td align="right">'+tcode_array[1][3].to_s+'</td>
|
259
|
-
<td align="right">'+'%.2f' % (100*tcode_array[1][3].to_f/no_db.to_f).to_s+' %</td>
|
260
|
-
</tr>'
|
261
|
-
stats_file.puts ' <tr>
|
262
|
-
<td align="left"> Putative ncRNA</td>
|
263
|
-
<td align="right">'+ncrna_total.to_s+'</td>
|
264
|
-
<td align="right">'+'%.2f' % (100*ncrna_total.to_f/no_db.to_f).to_s+' %</td>
|
265
|
-
</tr>'
|
266
|
-
stats_file.puts ' <tr>
|
267
|
-
<td align="left"> Unknown (all)</td>
|
268
|
-
<td align="right">'+tcode_array[2][0].to_s+'</td>
|
269
|
-
<td align="right">'+'%.2f' % (100*tcode_array[2][0].to_f/no_db.to_f).to_s+' %</td>
|
270
|
-
</tr>'
|
271
|
-
stats_file.puts ' <tr>
|
272
|
-
<td align="left"> Unknown > '+size_filter1.to_s+'bp</td>
|
273
|
-
<td align="right">'+tcode_array[2][2].to_s+'</td>
|
274
|
-
<td align="right">'+'%.2f' % (100*tcode_array[2][2].to_f/no_db.to_f).to_s+' %</td>
|
275
|
-
</tr>'
|
276
|
-
stats_file.puts ' <tr>
|
277
|
-
<td align="left"> Unknown > '+size_filter2.to_s+'bp</td>
|
278
|
-
<td align="right">'+tcode_array[2][3].to_s+'</td>
|
279
|
-
<td align="right">'+'%.2f' % (100*tcode_array[2][3].to_f/no_db.to_f).to_s+' %</td>
|
280
|
-
</tr>'
|
281
|
-
end
|
282
|
-
stats_file.puts ' </table>
|
283
|
-
<sup>1</sup> Percents for subclassifications of this category were calculated using this line as 100% reference.'
|
284
|
-
|
83
|
+
# Builds a fresh table of counters for the representative-transcriptome
# report, every one starting at zero.
#
# The 'coding_>…' keys are the ones incremented by coding_stats_reptrans.
#
# Returns a new Hash mapping each counter name (String) to 0.
def initialize_stats_hash_reptrans
  counter_names = [
    'prot_annotated', 'est_annotated',
    'coding_>1', 'coding_>0.94', 'coding_>0.84', 'coding_>0.73', 'coding_>0'
  ]

  stats_hash = {}
  counter_names.each { |counter| stats_hash[counter] = 0 }
  stats_hash
end
|
94
|
+
|
95
|
+
# Adds the statistics of one batch of sequences to the running totals.
#
# seqs                    - Array of sequence objects. Each must respond to
#                           type, fasta_length, status, db_name and seq_name;
#                           sequences with a protein hit must also respond to
#                           hit.acc.
# stats_hash              - Hash of counters, incremented in place.
# diff_ids_array          - Array collecting the distinct hit accessions of
#                           sequences whose type lies strictly between UNKNOWN
#                           and NCRNA; updated in place.
# diff_ids_complete_array - Array collecting the distinct hit accessions of
#                           COMPLETE sequences; updated in place.
#
# The type constants (FAILED, UNKNOWN, COMPLETE, ...) are not defined in this
# method; the code relies on them being ordered numbers.
#
# Returns [stats_hash, diff_ids_array, diff_ids_complete_array].
def summary_stats(seqs, stats_hash, diff_ids_array, diff_ids_complete_array)
  low_limit = 200
  upper_limit = 500

  # All sequences
  stats_hash['output_seqs'] += seqs.length
  good_seqs = seqs.select { |s| s.type >= UNKNOWN }
  stats_hash['good_seqs'] += good_seqs.length

  # Longest unigene (max is nil for an empty batch, so guard the comparison)
  current_longest_unigene = seqs.map { |s| s.fasta_length }.max
  if !current_longest_unigene.nil? && current_longest_unigene > stats_hash['longest_unigene']
    stats_hash['longest_unigene'] = current_longest_unigene
  end

  # Accessions of every sequence annotated with a hit
  seqs.each do |s|
    diff_ids_array << s.hit.acc if s.type > UNKNOWN && s.type < NCRNA
  end
  diff_ids_array.uniq!

  # Good sequences by length
  stats_hash['sequences_>200'] += good_seqs.count { |s| s.fasta_length > low_limit }
  stats_hash['sequences_>500'] += good_seqs.count { |s| s.fasta_length > upper_limit }

  stats_hash['failed'] += seqs.count { |s| s.type == FAILED }

  # Unknown
  all_unknown = seqs.select { |s| s.type == UNKNOWN }
  stats_hash['unknown'] += all_unknown.length
  stats_hash['unknown_>200'] += all_unknown.count { |s| s.fasta_length > low_limit }
  stats_hash['unknown_>500'] += all_unknown.count { |s| s.fasta_length > upper_limit }

  # Artifacts
  stats_hash['artifacts'] += seqs.count { |s| s.type < UNKNOWN && s.type > FAILED }
  stats_hash['misassembled'] += seqs.count { |s| s.type == MISASSEMBLED }
  # Pieces named '*_split_*' are skipped so a chimera split into several
  # pieces is not counted more than once.
  stats_hash['chimeras'] += seqs.count { |s| s.type == CHIMERA && !s.seq_name.include?('_split_') }
  stats_hash['other_artifacts'] += seqs.count { |s| s.type == OTHER }

  # Annotated with protein
  prot_annotated = seqs.select { |s| s.type >= COMPLETE && s.type <= INTERNAL }
  stats_hash['prot_annotated'] += prot_annotated.length

  # By annotation
  stats_hash['internal'] += seqs.count { |s| s.type == INTERNAL }
  complete = seqs.select { |s| s.type == COMPLETE }
  n_terminal = seqs.select { |s| s.type == N_TERMINAL }
  c_terminal = seqs.select { |s| s.type == C_TERMINAL }

  stats_hash['complete'] += complete.length
  stats_hash['n_terminal'] += n_terminal.length
  stats_hash['c_terminal'] += c_terminal.length

  # Accessions of complete sequences
  complete.each { |s| diff_ids_complete_array << s.hit.acc }
  diff_ids_complete_array.uniq!

  # By status: a truthy status counts as "sure", anything else as "putative"
  stats_hash['complete_sure'] += complete.count { |s| s.status }
  stats_hash['n_terminal_sure'] += n_terminal.count { |s| s.status }
  stats_hash['c_terminal_sure'] += c_terminal.count { |s| s.status }
  stats_hash['complete_putative'] += complete.count { |s| !s.status }
  stats_hash['n_terminal_putative'] += n_terminal.count { |s| !s.status }
  stats_hash['c_terminal_putative'] += c_terminal.count { |s| !s.status }

  # By database: names starting with 'sp_' or 'tr_'; everything else is
  # attributed to the user database.
  swissprot = prot_annotated.count { |s| s.db_name =~ /^sp_/ }
  trembl = prot_annotated.count { |s| s.db_name =~ /^tr_/ }
  stats_hash['swissprot'] += swissprot
  stats_hash['trembl'] += trembl
  stats_hash['userdb'] += prot_annotated.length - swissprot - trembl

  # ncRNA
  stats_hash['ncrna'] += seqs.count { |s| s.type == NCRNA }

  # Coding sequences
  coding = seqs.select { |s| s.type == CODING }
  stats_hash['coding'] += coding.length
  stats_hash['coding_sure'] += coding.count { |s| s.status }
  stats_hash['coding_putative'] += coding.count { |s| !s.status }
  stats_hash['coding_>200'] += coding.count { |s| s.fasta_length > low_limit }
  stats_hash['coding_>500'] += coding.count { |s| s.fasta_length > upper_limit }

  return stats_hash, diff_ids_array, diff_ids_complete_array
end
|
290
193
|
|
194
|
+
# Fills in the totals that can only be computed once every batch has been
# accumulated: the number of distinct orthologue ids, the number of distinct
# complete ids, and the 'BA_index' figure derived from them.
#
# stats_hash              - Hash of counters, updated in place.
# diff_ids_array          - Array of distinct hit accessions.
# diff_ids_complete_array - Array of distinct accessions of complete hits.
#
# 'BA_index' is only written when every count it depends on is positive;
# otherwise whatever value it already holds is left untouched.
#
# Returns stats_hash.
def last_stats(stats_hash, diff_ids_array, diff_ids_complete_array)
  stats_hash['different_orthologues'] = diff_ids_array.length
  stats_hash['different_completes'] = diff_ids_complete_array.length

  # BA index
  required = ['prot_annotated', 'complete', 'sequences_>500',
              'different_orthologues', 'different_completes']
  if required.all? { |key| stats_hash[key] > 0 }
    annotation_coef = (stats_hash['prot_annotated'] * stats_hash['complete'] * 1.0) /
                      (stats_hash['sequences_>500'] * 10000)
    improvement_coef = (stats_hash['different_orthologues'] * 1.0 + stats_hash['different_completes']) /
                       (stats_hash['prot_annotated'] + stats_hash['complete'])
    stats_hash['BA_index'] = Math.sqrt(annotation_coef * improvement_coef)
  end

  stats_hash
end
|
291
210
|
|
211
|
+
# Increments the counter of the score band that +coding_seq+ falls into.
#
# coding_seq - object responding to t_code (a number).
# stats_hash - Hash holding the 'coding_>…' counters, updated in place.
#
# Bands are checked from the highest threshold down and the first one whose
# threshold is strictly exceeded wins. Note that the '>0.94' and '>0.84'
# counters are fed by the thresholds 0.95 and 0.85 respectively. A score of
# zero or below is not counted anywhere.
#
# Returns the new value of the incremented counter, or nil when no band
# matched.
def coding_stats_reptrans(coding_seq, stats_hash)
  bands = [
    [1,    'coding_>1'],
    [0.95, 'coding_>0.94'],
    [0.85, 'coding_>0.84'],
    [0.73, 'coding_>0.73'],
    [0,    'coding_>0']
  ]

  score = coding_seq.t_code
  band = bands.find { |threshold, _counter| score > threshold }
  stats_hash[band.last] += 1 if band
end
|
292
228
|
|
229
|
+
# Finalises the accumulated statistics and writes both summary reports.
#
# stats_hash              - Hash of counters accumulated so far.
# stats_taxonomy          - Hash of organism name => hit count, forwarded to
#                           the HTML report.
# diff_ids_array          - Array of distinct hit accessions.
# diff_ids_complete_array - Array of distinct accessions of complete hits.
# txt_file                - output handle passed to write_txt.
# html_file               - output handle passed to write_html.
#
# Returns whatever write_html returns.
def write_summary_stats(stats_hash, stats_taxonomy, diff_ids_array, diff_ids_complete_array, txt_file, html_file)
  final_stats = last_stats(stats_hash, diff_ids_array, diff_ids_complete_array)
  write_txt(final_stats, txt_file)
  write_html(final_stats, html_file, stats_taxonomy)
end
|
293
234
|
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
html_1 = '
|
308
|
-
<h2 align="center">
|
309
|
-
Status report
|
310
|
-
</h2>
|
311
|
-
|
312
|
-
<table border="2" cellspacing="0" cellpadding="2">
|
313
|
-
<tr>
|
314
|
-
<th colspan="2">Status</th>
|
315
|
-
<th>Unigenes</th>
|
316
|
-
<th>%</th>
|
317
|
-
</tr>'
|
318
|
-
|
319
|
-
html_2= '
|
320
|
-
<h2 align="center">
|
321
|
-
Unigene report
|
322
|
-
</h2>
|
323
|
-
|
324
|
-
<table border="2" cellspacing="0" cellpadding="2">
|
325
|
-
<tr>
|
326
|
-
<th></th>
|
327
|
-
<th>Unigenes</th>
|
328
|
-
<th>%</th>
|
329
|
-
</tr>'
|
330
|
-
|
331
|
-
html_3= '
|
332
|
-
<h2 align="center">
|
333
|
-
Database usage
|
334
|
-
</h2>
|
335
|
-
|
336
|
-
<table border="2" cellspacing="0" cellpadding="2">
|
337
|
-
<tr>
|
338
|
-
<th></th>
|
339
|
-
<th>Unigenes</th>
|
340
|
-
<th>%</th>
|
341
|
-
</tr>'
|
342
|
-
|
343
|
-
html_4= '
|
344
|
-
<h2 align="center">
|
345
|
-
Report guiding assembly quality
|
346
|
-
</h2>
|
347
|
-
|
348
|
-
<table border="2" cellspacing="0" cellpadding="2">
|
349
|
-
<tr>
|
350
|
-
<th></th>
|
351
|
-
<th>Unigenes</th>
|
352
|
-
<th>%</th>
|
353
|
-
</tr>'
|
354
|
-
|
355
|
-
html_5 = ' </body>
|
356
|
-
</html>'
|
357
|
-
|
358
|
-
return [html_head, html_1, html_2, html_3, html_4, html_5]
|
235
|
+
# Writes the representative-transcriptome statistics as a text report and
# as an HTML report.
#
# stats_hash - Hash of counters to report.
# html_file  - path of the HTML report to create (overwritten if present).
# txt_file   - path of the text report to create (overwritten if present).
#
# Both files are opened in block form so they are flushed and closed when
# the method returns, even if one of the writers raises.
#
# Returns whatever write_html_reptrans returns.
def write_reptrans_stats(stats_hash, html_file, txt_file)
  File.open(html_file, 'w') do |html|
    File.open(txt_file, 'w') do |txt|
      write_txt(stats_hash, txt)
      write_html_reptrans(stats_hash, html)
    end
  end
end
|
241
|
+
|
242
|
+
# Writes the complete representative-transcriptome HTML document.
#
# stats_hash - Hash of counters, forwarded to body_reptrans.
# html_file  - open output handle (anything responding to puts).
#
# The document is the <html> wrapper around the output of header and
# body_reptrans, in that order.
def write_html_reptrans(stats_hash, html_file)
  html_file.puts '<html>'
  header(html_file)
  body_reptrans(html_file, stats_hash)
  html_file.puts '</html>'
end
|
359
248
|
|
249
|
+
# Writes the statistics as plain text, one counter per line.
#
# stats_hash - Hash of counter name => value.
# file       - open output handle (anything responding to puts).
#
# Each line is "<value><TAB><name>"; lines follow the hash's own order.
#
# Returns stats_hash.
def write_txt(stats_hash, file)
  stats_hash.each_pair do |name, count|
    file.puts "#{count}\t#{name}"
  end
end
|
361
254
|
|
255
|
+
# Writes the complete HTML summary document and makes sure the
# 'expresscanvas' directory it links to sits next to it.
#
# stats_hash     - Hash of counters, forwarded to html_header and body.
# html_file      - open File for the report; it must respond to to_path,
#                  which is used to find the output directory.
# stats_taxonomy - Hash of organism name => hit count, forwarded to
#                  html_header.
#
# When the output directory has no 'expresscanvas' entry, expresscanvas.zip
# (looked up two directories above this source file) is unpacked there with
# the external `unzip` command. Its exit status is not checked.
def write_html(stats_hash, html_file, stats_taxonomy)
  js_path = File.dirname(html_file.to_path)
  # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
  unless File.exist?(File.join(js_path, 'expresscanvas'))
    zip_path = File.join(File.dirname(__FILE__), '..', '..', 'expresscanvas.zip')
    # Argument-list form: no shell is involved, so paths containing spaces
    # or shell metacharacters are passed through intact.
    system('unzip', '-qq', zip_path, '-d', js_path)
  end

  html_file.puts '<html>'
  html_header(html_file, stats_hash, stats_taxonomy)
  body(html_file, stats_hash)
  html_file.puts '</html>'
end
|
362
263
|
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
error_1_num = 0
|
369
|
-
uni_500 = 0
|
370
|
-
uni_200 = 0
|
371
|
-
longest_one = 0
|
372
|
-
|
373
|
-
status_array = []
|
374
|
-
# total, status
|
375
|
-
complete = [0,'Complete']
|
376
|
-
putative_complete = [0,'Putative Complete']
|
377
|
-
c_terminus = [0,'C-terminus']
|
378
|
-
putative_c_terminus = [0,'Putative C-terminus']
|
379
|
-
n_terminus = [0,'N-terminus']
|
380
|
-
putative_n_terminus = [0,'Putative N-terminus']
|
381
|
-
internal = [0,'Internal']
|
382
|
-
cod_seq = [0,'Misassembled']
|
383
|
-
|
384
|
-
#userdb, SwissProt, TrEMBL
|
385
|
-
db_usage = [0,0,0]
|
386
|
-
|
387
|
-
File.open('fln_results/dbannotated.txt').each do |line|
|
388
|
-
line.chomp!
|
389
|
-
(name,fasta_length,acc,db_name,status,kk1,kk2,kk3,kk4,kk5,msgs) = line.split("\t")
|
390
|
-
|
391
|
-
if (line !~ /^Query_id\t/) && (!line.empty?)
|
392
|
-
seqs_number += 1
|
393
|
-
if (fasta_length.to_i > longest_one)
|
394
|
-
longest_one = fasta_length.to_i
|
395
|
-
end
|
396
|
-
array_of_all_accs.push acc
|
397
|
-
|
398
|
-
if (db_name !~ /^sp_/) && (db_name !~ /^tr_/)
|
399
|
-
db_usage[0] += 1
|
400
|
-
elsif (db_name =~ /^sp_/)
|
401
|
-
db_usage[1] += 1
|
402
|
-
elsif (db_name =~ /^tr_/)
|
403
|
-
db_usage[2] += 1
|
404
|
-
end
|
405
|
-
|
406
|
-
# -------------------------------------------------------------------------
|
407
|
-
if (fasta_length.to_i >= size_filter1)
|
408
|
-
uni_200 += 1
|
409
|
-
end
|
410
|
-
if (fasta_length.to_i >= size_filter2)
|
411
|
-
uni_500 += 1
|
412
|
-
end
|
413
|
-
# -------------------------------------------------------------------------
|
414
|
-
if (msgs =~ /ERROR#1/)
|
415
|
-
error_1_num += 1
|
416
|
-
end
|
417
|
-
# -------------------------------------------------------------------------
|
418
|
-
if (status == 'Complete')
|
419
|
-
complete[0] += 1
|
420
|
-
array_of_complete_accs.push acc
|
421
|
-
|
422
|
-
elsif (status == 'Putative Complete')
|
423
|
-
putative_complete[0] += 1
|
424
|
-
elsif (status == 'C-terminus')
|
425
|
-
c_terminus[0] += 1
|
426
|
-
elsif (status == 'N-terminus')
|
427
|
-
n_terminus[0] += 1
|
428
|
-
elsif (status == 'Putative C-terminus')
|
429
|
-
putative_c_terminus[0] += 1
|
430
|
-
elsif (status == 'Putative N-terminus')
|
431
|
-
putative_n_terminus[0] += 1
|
432
|
-
elsif (status == 'Internal')
|
433
|
-
internal[0] += 1
|
434
|
-
elsif (status == 'Misassembled')
|
435
|
-
cod_seq[0] += 1
|
436
|
-
end
|
437
|
-
# -------------------------------------------------------------------------
|
438
|
-
end
|
264
|
+
# Writes a minimal <head> section containing only the page title.
#
# html_file - open output handle (anything responding to puts).
def header(html_file)
  html_file.puts '<head>', '<title>FLN Summary</title>', '</head>'
end
|
439
269
|
|
440
|
-
|
270
|
+
# Writes the <head> section of the summary page, including the script that
# draws the two CanvasXpress charts ('profile' and 'taxonomy').
#
# html_file      - open output handle (anything responding to puts).
# stats_hash     - Hash of counters; the structural counters are plotted as
#                  percentages of stats_hash['good_seqs'].
# stats_taxonomy - Hash of organism name => hit count; the 21 largest
#                  entries are plotted.
#
# Differences from a naive rendering of the counters:
# * the "Sure" series of the Coding bar uses 'coding_sure', so that stacking
#   it with 'coding_putative' adds up to the total (plotting 'coding' there
#   would count the putative ones twice);
# * organism names are escaped before being placed in single-quoted
#   JavaScript string literals;
# * percentages are 0.0 when there are no good sequences.
def html_header(html_file, stats_hash, stats_taxonomy)
  # Order matches the 'smps' labels of the profile chart below.
  structural_data_sure = [
    stats_hash['unknown'],
    stats_hash['complete_sure'],
    stats_hash['n_terminal_sure'],
    stats_hash['c_terminal_sure'],
    stats_hash['internal'],
    stats_hash['ncrna'],
    stats_hash['coding_sure']
  ]
  structural_data_putative = [
    0,
    stats_hash['complete_putative'],
    stats_hash['n_terminal_putative'],
    stats_hash['c_terminal_putative'],
    0,
    0,
    stats_hash['coding_putative']
  ]

  good_seqs = stats_hash['good_seqs']
  to_percent = lambda { |stat| good_seqs == 0 ? 0.0 : stat * 100.0 / good_seqs }
  values_structural_sure = "[#{structural_data_sure.map(&to_percent).join(', ')}]"
  values_structural_putative = "[#{structural_data_putative.map(&to_percent).join(', ')}]"

  # Most frequent organisms first, at most 21 of them.
  data = stats_taxonomy.to_a.sort { |a, b| b.last <=> a.last }[0..20]
  js_escape = lambda { |text| text.to_s.gsub(/[\\']/) { |char| "\\#{char}" } }
  smps_taxonomy = "['#{data.map { |tax| js_escape.call(tax.first) }.join("', '")}']"
  values_taxonomy = "[#{data.map { |tax| tax.last }.join(', ')}]"

  html_file.puts [
    '<head>',
    '<title>FLN Summary</title>',
    '<meta http-equiv="CACHE-CONTROL" CONTENT="NO-CACHE">',
    '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
    '',
    '<!--[if lt IE 9]><script type="text/javascript" src="./expresscanvas/js/flashcanvas.js"></script><![endif]-->',
    '<script type="text/javascript" src="./expresscanvas/js/canvasXpress.min.js"></script>',
    '',
    "<script id='demoScript'>",
    'var showDemo = function () {'
  ]

  html_file.puts [
    "new CanvasXpress('profile',",
    "{",
    "'y' : {",
    "'vars' : ['Sure', 'Putative'],",
    "'smps' : ['Unknown', 'Complete', 'N-terminal', 'C-terminal', 'Internal', 'ncrna', 'Coding'],",
    "'data' : [#{values_structural_sure},",
    "#{values_structural_putative}],",
    "},",
    "'a' : {",
    "'xAxis' : ['Sure', 'Putative']",
    "},",
    "},",
    "",
    "{'gradient': false,",
    "'toolbarPermanent': true,",
    "'graphOrientation': 'vertical',",
    "'graphType': 'Stacked',",
    "'legendBackgroundColor': false,",
    "'smpLabelScaleFontFactor': 0.8,",
    "'xAxisTitle': '% sequences',",
    "'xAxis2Show': false,",
    "'xAxisExact': true,",
    "'setMaxX': 80,",
    "'setMinX': 0,",
    "'axisTitleScaleFontFactor': 2,",
    "'smpTitleFontStyle': 'italic',",
    "'titleHeight': 60",
    "}",
    ");",
    "",
    "new CanvasXpress('taxonomy',",
    "{",
    "'y' : {",
    "'vars' : ['Annotations'],",
    "'smps' : #{smps_taxonomy},",
    "'data' : [#{values_taxonomy}],",
    "},",
    "'a' : {",
    "'xAxis' : ['Sure', 'Putative']",
    "},",
    "},",
    "",
    "{'gradient': false,",
    "'toolbarPermanent': true,",
    "'graphOrientation': 'horizontal',",
    "'showLegend': false,",
    "'smpLabelScaleFontFactor': 1.5,",
    "'xAxisTitle': 'Number of sequences',",
    "'xAxis2Show': false,",
    "'titleHeight': 60",
    "}",
    ");",
    "}",
    "</script>",
    "</head>"
  ]
end
|
447
366
|
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
File.open('fln_results/new_coding.txt').each do |line|
|
461
|
-
line.chomp!
|
462
|
-
(name,fasta_length,acc,db_name,status) = line.split("\t")
|
463
|
-
|
464
|
-
if (line !~ /^Query_id\t/) && (!line.empty?)
|
465
|
-
seqs_number += 1
|
466
|
-
|
467
|
-
if (fasta_length.to_i > longest_one)
|
468
|
-
longest_one = fasta_length.to_i
|
469
|
-
end
|
470
|
-
|
471
|
-
# -------------------------------------------------------------------------
|
472
|
-
if (fasta_length.to_i >= size_filter1)
|
473
|
-
uni_200 += 1
|
474
|
-
end
|
475
|
-
if (fasta_length.to_i >= size_filter2)
|
476
|
-
uni_500 += 1
|
477
|
-
end
|
478
|
-
# -------------------------------------------------------------------------
|
479
|
-
|
480
|
-
if (fasta_length.to_i > size_filter1)
|
481
|
-
if (status == 'coding')
|
482
|
-
coding_stats[2] += 1
|
483
|
-
elsif (status == 'putative_coding')
|
484
|
-
p_coding_stats[2] += 1
|
485
|
-
elsif (status == 'unknown')
|
486
|
-
unknown_stats[2] += 1
|
487
|
-
end
|
488
|
-
end
|
489
|
-
|
490
|
-
if (fasta_length.to_i > size_filter2)
|
491
|
-
if (status == 'coding')
|
492
|
-
coding_stats[3] += 1
|
493
|
-
elsif (status == 'putative_coding')
|
494
|
-
p_coding_stats[3] += 1
|
495
|
-
elsif (status == 'unknown')
|
496
|
-
unknown_stats[3] += 1
|
497
|
-
end
|
498
|
-
end
|
499
|
-
|
500
|
-
if (status == 'coding')
|
501
|
-
coding_stats[0] += 1
|
502
|
-
elsif (status == 'putative_coding')
|
503
|
-
p_coding_stats[0] += 1
|
504
|
-
elsif (status == 'unknown')
|
505
|
-
unknown_stats[0] += 1
|
506
|
-
end
|
367
|
+
# Writes the <body> section of the representative-transcriptome summary page.
#
# html_file::  object responding to +puts+ that receives the HTML.
# stats_hash:: statistics hash, handed unchanged to both table writers.
#
# The two tables are placed side by side inside one fixed-size container:
# the per-category table on the left and the accumulated table on the right.
def body_reptrans(html_file, stats_hash)
  html_file.puts '<body bgcolor="#FFFFFF" >', '<center>' # Start body
  html_file.puts '<div style="float:center; font-size:30; margin:10px"><b>', 'Full-LengtherNEXT Representative Transcriptome Summary', '</b></div>'
  # TABLES
  html_file.puts '<div style=" width: 850px; height: 350px; padding: 10 ">'
  reptrans_report(html_file, stats_hash, 'left')
  # The alignment is pasted verbatim into a CSS "float:" declaration, so it
  # must be a valid CSS keyword ("right"; it used to be misspelled).
  reptrans_acumulative_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  # END TABLES
  html_file.puts '</center>', '</body>' # End body
end
|
507
378
|
|
508
|
-
|
379
|
+
# Writes the <body> section of the main Full-LengtherNEXT summary page.
#
# html_file::  object responding to +puts+ that receives the HTML.
# stats_hash:: statistics hash, handed unchanged to every table writer.
#
# The page is built from three fixed-size containers, each holding a
# left-floated element and a right-floated element:
#   1. general info table      | assembly quality table
#   2. structural profile plot | status table
#   3. taxonomy plot           | database usage table
def body(html_file, stats_hash)
  # NOTE(review): "id=demo" sits inside the quoted onload handler; it looks
  # like a misplaced attribute, but it is kept as-is -- confirm the intent.
  html_file.puts '<body bgcolor="#FFFFFF" onload="showDemo(); id=demo">', '<center>' # Start body
  html_file.puts '<div style="float:center; font-size:30; margin:10px"><b>', 'Full-LengtherNEXT Summary', '</b></div>'

  # TABLES
  html_file.puts '<div style="overflow: hidden; width: 950px; height: 550px; padding: 10 ">'
  general_report(html_file, stats_hash, 'left')
  assembly_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  html_file.puts '<div style="overflow: hidden; width: 950px; height: 550px; padding: 10 ">'
  status_graph(html_file, 'left')
  # The alignment ends up in a CSS "float:" declaration, so it has to be the
  # valid keyword "right" (it used to be misspelled in the next two blocks).
  status_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  html_file.puts '<div style="overflow: hidden; width: 950px; height: 750px; padding: 10 ">'
  taxonomy_graph(html_file, 'left')
  database_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  # END TABLES
  html_file.puts '</center>', '</body>' # End body
end
|
509
399
|
|
510
|
-
end
|
511
400
|
|
512
|
-
status_array = [coding_stats, p_coding_stats, unknown_stats]
|
513
401
|
|
514
|
-
|
515
|
-
|
402
|
+
# Writes the "Sequences info" table of the representative-transcriptome page.
#
# html_file::  object responding to +puts+ that receives the HTML.
# stats_hash:: hash of counters; the sum of ALL its values is used both as
#              the "Output" figure and as the 100% reference of every row.
# align::      string pasted into the CSS "float:" of the wrapping <div>.
def reptrans_report(html_file, stats_hash, align)
  total_seqs = stats_hash.values.inject(0) { |sum, count| sum + count }

  # Row label => counter shown in that row, in display order.
  rows = [
    ['Output', total_seqs],
    ['Annotated with protein', stats_hash['prot_annotated']],
    ['Annotated with EST', stats_hash['est_annotated']],
    ['Coding test-code > 1', stats_hash['coding_>1']],
    ['Coding test-code > 0.94', stats_hash['coding_>0.94']],
    ['Coding test-code > 0.84', stats_hash['coding_>0.84']],
    ['Coding test-code > 0.73', stats_hash['coding_>0.73']],
    ['Coding test-code > 0', stats_hash['coding_>0']]
  ]

  fragments = []
  fragments << '<div style=" margin: 0; float:' + align + '">'
  fragments << table_title('Sequences info')
  fragments.concat(table_header(['', 'Sequences', '%'], 0))
  rows.each do |label, count|
    fragments.concat(single_row(label, count, total_seqs))
  end
  fragments << '</table>'
  fragments << '</div>'
  write_array_html(fragments, html_file)
end
|
516
421
|
|
422
|
+
# Writes the accumulated "Sequences summary" table of the
# representative-transcriptome page.
#
# html_file::  object responding to +puts+ that receives the HTML.
# stats_hash:: hash of counters; the sum of ALL its values is the 100%
#              reference of every row.
# align::      string pasted into the CSS "float:" of the wrapping <div>.
#
# Each row shows its own counter plus the counters of all rows above it.
def reptrans_acumulative_report(html_file, stats_hash, align)
  total_seqs = stats_hash.values.inject(0) { |sum, count| sum + count }

  # Row label / stats_hash key, in display (and accumulation) order.
  categories = [
    ['Annotated with protein', 'prot_annotated'],
    ['Annotated with EST', 'est_annotated'],
    ['Coding test-code > 1', 'coding_>1'],
    ['Coding test-code > 0.94', 'coding_>0.94'],
    ['Coding test-code > 0.84', 'coding_>0.84'],
    ['Coding test-code > 0.73', 'coding_>0.73']
  ]

  fragments = []
  fragments << '<div style=" margin: 0; float:' + align + '">'
  fragments << table_title('Sequences summary (Acumulative)')
  fragments.concat(table_header(['', 'Sequences', '%'], 0))

  running_total = 0
  categories.each do |label, key|
    running_total = stats_hash[key] + running_total
    fragments.concat(single_row(label, running_total, total_seqs))
  end

  fragments << '</table>'
  fragments << '</div>'
  write_array_html(fragments, html_file)
end
|
445
|
+
|
446
|
+
# Writes the "General info" table (input, failing, artifact and output
# sequence counts) of the main summary page.
#
# html_file::  object responding to +puts+ that receives the HTML.
# stats_hash:: hash of counters read by key (see the rows below).
# align::      string pasted into the CSS "float:" of the wrapping <div>.
#
# The Misassembled / Chimeras / Other rows are sub-rows: they are flagged
# for indentation and their percentage is relative to the artifacts count.
# The "BA index" row is only written when the index is greater than zero.
def general_report(html_file, stats_hash, align)
  html = []
  html << '<div style="margin: 0; float:'+align+'">'
  html << table_title('General info')
  html.concat(table_header(['', 'Sequences', '%'], 0))
  html.concat(single_row('Input', stats_hash['input_seqs'], stats_hash['input_seqs']))
  html.concat(single_row('Failing sequences', stats_hash['failed'], stats_hash['output_seqs']))
  html.concat(single_row('Artifacts <sup>1</sup>', stats_hash['artifacts'], stats_hash['output_seqs']))
  # Literal +true+ instead of the TRUE constant, which no longer exists in
  # current Ruby releases.
  html.concat(single_row('Misassembled', stats_hash['misassembled'], stats_hash['artifacts'], true))
  html.concat(single_row('Chimeras', stats_hash['chimeras'], stats_hash['artifacts'], true))
  html.concat(single_row('Other', stats_hash['other_artifacts'], stats_hash['artifacts'], true))
  html.concat(single_row('Sequences with resolved chimeras', stats_hash['output_seqs'], stats_hash['input_seqs']))
  html.concat(single_row('Sequences without artifacts', stats_hash['good_seqs'], stats_hash['output_seqs']))
  # The index is pre-formatted here and passed with a nil total, so no
  # percentage is computed for this row.
  html.concat(single_row('BA index', "%5.2f" % [stats_hash['BA_index']], nil)) if stats_hash['BA_index'] > 0
  html << '</table>'
  html << '</div>'
  write_array_html(html, html_file)
end
|
517
464
|
|
518
|
-
|
465
|
+
# Writes the placeholder for the taxonomy plot: a floated <div> with a title
# and a one-cell table holding the <canvas id='taxonomy'> element.
#
# html_file:: object responding to +puts+ that receives the HTML.
# align::     string pasted into the CSS "float:" of the wrapping <div>.
def taxonomy_graph(html_file, align)
  # Table markup plus the closing tag of the wrapping <div>, one tag per line.
  canvas_markup = [
    '<table >',
    '<tr>',
    '<td>',
    '<canvas id=\'taxonomy\' width=\'540\' height=\'640\'></canvas>',
    '</td>',
    '</tr>',
    '</table>',
    '</div>'
  ].join("\n")

  html_file.puts '<div style=\'float:' + align + '\'>'
  html_file.puts table_title('Taxonomy distribution on annotations')
  html_file.puts canvas_markup
end
|
519
478
|
|
520
|
-
uni_500 = 0
|
521
|
-
uni_200 = 0
|
522
|
-
nc_total = 0
|
523
|
-
longest_one = 0
|
524
479
|
|
525
|
-
|
526
|
-
|
527
|
-
|
480
|
+
# Writes the "Database usage" table of the main summary page.
#
# html_file::  object responding to +puts+ that receives the HTML.
# stats_hash:: hash of counters; 'good_seqs' is the 100% reference of every
#              row, and the "None" row is 'coding' + 'unknown'.
# align::      string pasted into the CSS "float:" of the wrapping <div>.
def database_report(html_file, stats_hash, align)
  html = []
  # The ';' after "margin: 0" is required: without it the margin and float
  # declarations merge into one invalid declaration and the float is ignored.
  html << '<div style=" margin: 0; float:'+align+'">'
  html << table_title('Database usage')
  html.concat(table_header(['', 'Unigenes', '%'], 0))
  html.concat(single_row('UserDB', stats_hash['userdb'], stats_hash['good_seqs']))
  html.concat(single_row('SwissProt', stats_hash['swissprot'], stats_hash['good_seqs']))
  html.concat(single_row('TrEMBL', stats_hash['trembl'], stats_hash['good_seqs']))
  html.concat(single_row('ncRNA', stats_hash['ncrna'], stats_hash['good_seqs']))
  html.concat(single_row('None', stats_hash['coding']+ stats_hash['unknown'], stats_hash['good_seqs']))
  html.concat(single_row('Total', stats_hash['good_seqs'], stats_hash['good_seqs']))
  html << '</table>'
  html << '</div>'
  write_array_html(html, html_file)
end
|
528
495
|
|
529
|
-
|
496
|
+
# Writes the "Report guiding assembly quality" table of the main summary
# page, followed by the footnote explaining the <sup>1</sup> marker.
#
# html_file::  object responding to +puts+ that receives the HTML.
# stats_hash:: hash of counters read by key (see the rows below).
# align::      string pasted into the CSS "float:" of the wrapping <div>.
#
# Rows flagged with +true+ are sub-rows: their percentage is relative to the
# category row above them ('prot_annotated', or 'coding' + 'unknown')
# instead of 'good_seqs'.
def assembly_report(html_file, stats_hash, align)
  html = []
  html << '<div style=" margin: 0; float:'+align+'">'
  html << table_title('Report guiding assembly quality')
  html.concat(table_header(['', 'Unigenes', '%'], 0))
  html.concat(single_row('Unigenes', stats_hash['good_seqs'], stats_hash['good_seqs']))
  html.concat(single_row('Unigenes >500pb', stats_hash['sequences_>500'], stats_hash['good_seqs']))
  html.concat(single_row('Unigenes >200pb', stats_hash['sequences_>200'], stats_hash['good_seqs']))
  # A length, not a count: nil total suppresses the percentage column.
  html.concat(single_row('Longest unigene', stats_hash['longest_unigene'], nil))
  html.concat(single_row('With orthologue <sup>1</sup>', stats_hash['prot_annotated'], stats_hash['good_seqs']))
  # Literal +true+ instead of the TRUE constant, which no longer exists in
  # current Ruby releases.
  html.concat(single_row('Different orthologue IDs', stats_hash['different_orthologues'], stats_hash['prot_annotated'], true))
  html.concat(single_row('Complete transcripts', stats_hash['complete'], stats_hash['prot_annotated'], true))
  html.concat(single_row('Different complete transcripts ', stats_hash['different_completes'], stats_hash['prot_annotated'], true))
  html.concat(single_row('ncRNA', stats_hash['ncrna'], stats_hash['good_seqs']))
  without_orthologue = stats_hash['coding']+ stats_hash['unknown']
  html.concat(single_row('Without orthologue <sup>1</sup>', without_orthologue, stats_hash['good_seqs']))
  html.concat(single_row('Coding (all)', stats_hash['coding'], without_orthologue, true))
  html.concat(single_row('Coding > 200bp', stats_hash['coding_>200'], without_orthologue, true))
  html.concat(single_row('Coding > 500bp', stats_hash['coding_>500'], without_orthologue, true))
  html.concat(single_row('Unknown (all)', stats_hash['unknown'], without_orthologue, true))
  html.concat(single_row('Unknown > 200bp', stats_hash['unknown_>200'], without_orthologue, true))
  html.concat(single_row('Unknown > 500bp', stats_hash['unknown_>500'], without_orthologue, true))
  html << '</table>'
  html << '<sup>1</sup> Percents for subclassifications of this category <br> were calculated using this line as 100% reference.'
  html << '</div>'
  write_array_html(html, html_file)
end
|
530
523
|
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
524
|
+
# Writes the placeholder for the structural-profile plot: a floated <div>
# with a title and a one-cell table holding the <canvas id='profile'>.
#
# html_file:: object responding to +puts+ that receives the HTML.
# align::     string pasted into the CSS "float:" of the wrapping <div>.
def status_graph(html_file, align)
  # Table markup plus the closing tag of the wrapping <div>, one tag per line.
  canvas_markup = [
    '<table >',
    '<tr>',
    '<td>',
    '<canvas id=\'profile\' width=\'500\' height=\'440\'></canvas>',
    '</td>',
    '</tr>',
    '</table>',
    '</div>'
  ].join("\n")

  html_file.puts '<div style=\'float:' + align + '\'>'
  html_file.puts table_title('Structural profile')
  html_file.puts canvas_markup
end
|
544
537
|
|
545
|
-
|
538
|
+
# Writes the "Status report" table of the main summary page.
#
# html_file::  object responding to +puts+ that receives the HTML.
# stats_hash:: hash of counters; 'good_seqs' is the 100% reference of every
#              row.
# align::      string pasted into the CSS "float:" of the wrapping <div>.
#
# Statuses that come in a sure/putative pair take two table rows
# (fused_row); the others take a single row spanning both label columns.
def status_report(html_file, stats_hash, align)
  reference = stats_hash['good_seqs']

  fragments = []
  fragments << '<div style=" margin: 0; float:' + align + '">'
  fragments << table_title('Status report')
  # The first header cell spans the status and sure/putative columns.
  fragments.concat(table_header(['Status', 'Unigenes', '%'], 2))

  fragments.concat(
    fused_row('Complete', stats_hash['complete_sure'], stats_hash['complete_putative'], reference)
  )
  fragments.concat(
    fused_row('C-terminus', stats_hash['c_terminal_sure'], stats_hash['c_terminal_putative'], reference)
  )
  fragments.concat(
    fused_row('N-terminus', stats_hash['n_terminal_sure'], stats_hash['n_terminal_putative'], reference)
  )
  fragments.concat(composed_single_row('Internal', stats_hash['internal'], reference))
  fragments.concat(
    fused_row('Coding', stats_hash['coding_sure'], stats_hash['coding_putative'], reference)
  )
  fragments.concat(composed_single_row('ncRNA', stats_hash['ncrna'], reference))
  fragments.concat(composed_single_row('Unknown', stats_hash['unknown'], reference))
  fragments.concat(composed_single_row('Total', reference, reference))

  fragments << '</table>'
  fragments << '</div>'
  write_array_html(fragments, html_file)
end
|
555
|
+
|
556
|
+
|
557
|
+
# Returns the HTML string for a table caption: +title+ in bold inside a
# styled <div>.
#
# title:: caption text (String); inserted as-is, without escaping.
def table_title(title)
  '<div style="font-size:25px; margin: 10"><b>' + title + '</b></div>'
end
|
561
|
+
|
562
|
+
# Returns the HTML fragments that open a bordered table and write its
# header row. The caller is responsible for closing the table.
#
# col_array:: header labels (Strings), one <th> each.
# colspan::   when greater than zero, the FIRST header cell gets this
#             colspan attribute; other cells never do.
#
# Returns an Array of Strings.
def table_header(col_array, colspan)
  header_cells = col_array.each_with_index.map do |label, position|
    if position == 0 && colspan > 0
      '<th colspan="' + colspan.to_s + '">' + label + '</th>'
    else
      '<th>' + label + '</th>'
    end
  end

  ['<table border="2" cellspacing="0" cellpadding="2">', '<tr>'] + header_cells + ['</tr>']
end
|
547
578
|
|
548
|
-
|
579
|
+
# Returns the HTML fragments of one table row: a label cell followed by the
# value and percentage cells produced by sub_row.
#
# name::      row label (String); inserted as-is, without escaping.
# magnitude:: value shown in the row.
# total::     100% reference for the percentage, or nil for no percentage.
# space::     when truthy, the label is prefixed with a space to mark the
#             row as a sub-row. The default is the literal +false+; the
#             FALSE constant no longer exists in current Ruby releases.
#
# Returns an Array of Strings. The caller's +name+ string is not modified.
def single_row(name, magnitude, total, space = false)
  # NOTE(review): a single leading space collapses in rendered HTML --
  # confirm whether a non-breaking space was intended here.
  name = ' ' + name if space
  html = []
  html << '<tr>'
  html << '<td align="left">' + name + '</td>'
  html.concat(sub_row(magnitude, total))
  html << '</tr>'
  html
end
|
550
590
|
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
591
|
+
|
592
|
+
# Returns the HTML fragments of a two-row table entry for a status that has
# a "Sure" and a "Putative" count. The status label cell spans both rows.
#
# type::               status label (String); inserted without escaping.
# sure_magnitude::     value shown in the "Sure" row.
# putative_magnitude:: value shown in the "Putative" row.
# total::              100% reference handed to sub_row for both rows.
#
# Returns an Array of Strings.
def fused_row(type, sure_magnitude, putative_magnitude, total)
  html = []
  # Open the first row explicitly so that every </tr> has a matching <tr>,
  # as single_row and composed_single_row already do.
  html << '<tr>'
  html << '<td rowspan="2" align="left">'+type+'</td>'
  html << seq_status('Sure')
  html.concat(sub_row(sure_magnitude, total))
  html << '</tr>'
  html << '<tr>'
  html << seq_status('Putative')
  html.concat(sub_row(putative_magnitude, total))
  html << '</tr>'
  html
end
|
604
|
+
|
605
|
+
# Returns the HTML string of a left-aligned table cell holding +status+.
#
# status:: cell text (String); inserted as-is, without escaping.
def seq_status(status)
  '<td align="left">' + status + '</td>'
end
|
609
|
+
|
610
|
+
# Returns the two right-aligned cells that end a table row: the value and
# its percentage of +total+.
#
# magnitude:: value to show; must be numeric whenever +total+ is not nil.
# total::     100% reference, or nil when no percentage applies.
#
# The percentage is written with two decimals and a trailing '%'. It is
# replaced by '-' when +total+ is nil or when the ratio is not a finite
# number (zero total: 0/0 gives NaN, n/0 gives Infinity).
#
# Returns an Array of two Strings.
def sub_row(magnitude, total)
  percentage = '-'
  unless total.nil?
    perc_float = magnitude * 100.0 / total
    # Format the Float directly. Going through a String made Infinity raise
    # ArgumentError, because only NaN was filtered out before formatting.
    percentage = format('%.2f', perc_float) + '%' if perc_float.finite?
  end
  [
    '<td align="right">' + magnitude.to_s + '</td>',
    '<td align="right">' + percentage + '</td>'
  ]
end
|
627
|
+
|
628
|
+
# Returns the HTML fragments of a one-row table entry whose label cell spans
# two columns, so it lines up with the two-column labels of fused_row.
#
# type::      row label (String); inserted as-is, without escaping.
# magnitude:: value shown in the row.
# total::     100% reference handed to sub_row.
#
# Returns an Array of Strings.
def composed_single_row(type, magnitude, total)
  label_cell = '<td colspan="2" align="left">' + type + '</td>'
  ['<tr>', label_cell] + sub_row(magnitude, total) + ['</tr>']
end
|
598
636
|
|
599
637
|
|
638
|
+
# Writes every element of +html+ to +html_file+ with one +puts+ call each.
#
# html::      Array of HTML fragments.
# html_file:: object responding to +puts+.
#
# Returns the Array of the individual +puts+ results.
def write_array_html(html, html_file)
  html.map do |fragment|
    html_file.puts(fragment)
  end
end
|
600
641
|
end
|