full_lengther_next 0.0.8 → 0.5.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -5,7 +5,7 @@ class String
5
5
  s = self.upcase
6
6
  a = s.split('').each_slice(3).map{|e| e.join}
7
7
 
8
- c={'GCT'=>'A', 'GCC'=>'A','GCA'=>'A','GCG'=>'A',
8
+ c={ 'GCT'=>'A','GCC'=>'A','GCA'=>'A','GCG'=>'A',
9
9
  'CGT'=>'R','CGC'=>'R','CGA'=>'R','CGG'=>'R','AGA'=>'R','AGG'=>'R',
10
10
  'AAT'=>'N','AAC'=>'N',
11
11
  'GAT'=>'D','GAC'=>'D',
@@ -37,6 +37,8 @@ class String
37
37
  else
38
38
  c[e]||'x'
39
39
  end
40
+ else
41
+ 'x'
40
42
  end
41
43
  }
42
44
  return res.compact.join
@@ -136,4 +138,4 @@ class String
136
138
  return self.reverse.split('').map{|e| c[e]}.join
137
139
  end
138
140
 
139
- end
141
+ end
@@ -1,600 +1,641 @@
1
+ require 'types.rb'
1
2
 
2
3
  module FlnStats
3
-
4
- def summary_stats
5
- stats_file = File.open('fln_results/summary_stats.html', 'w')
6
-
7
- size_filter1 = 200
8
- size_filter2 = 500
9
-
10
- # recogemos los trozos de html fijos
11
- (html_head, html_st, html_uni, html_db, html_as, html_end) = html_code
12
-
13
- total_seqs = 0
14
- status_suma = 0
15
- #recogemos los datos que necesitamos de los ficheros de resultados
16
- (status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats(size_filter1,size_filter2)
17
- (tcode_array, seqs_number2, tc_uni_500, tc_uni_200, tc_longest_one) = testcode_stats(size_filter1,size_filter2)
18
- (ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats(size_filter1,size_filter2)
19
- (chimera_total, ch_uni_500, ch_uni_200, ch_longest_one, ch_db_usage)=chimera_stats(size_filter1,size_filter2)
20
-
21
- seqs_number1 = (seqs_number1+chimera_total.to_i)
22
- total_seqs = (seqs_number1 + seqs_number2 + ncrna_total.to_i)
23
- uni_500 = (db_uni_500 + tc_uni_500 + nc_uni_500 + ch_uni_500)
24
- uni_200 = (db_uni_200 + tc_uni_200 + nc_uni_200 + ch_uni_200)
25
- longest_one = [db_longest_one, tc_longest_one, nc_longest_one, ch_longest_one].max
26
- db_usage[0] += ch_db_usage[0]
27
- db_usage[1] += ch_db_usage[1]
28
- db_usage[2] += ch_db_usage[2]
29
- stats_file.puts html_head
30
-
31
- if (total_seqs.to_i > 0)
32
- # imprimimos la tabla Status Report --------------------------------------------------------------------------------------------
33
- stats_file.puts html_st
34
- status_array.each do |status|
35
- if (status[1] == 'Internal') || (status[1] == 'Misassembled')
36
- stats_file.puts ' <tr>
37
- <td colspan="2" align="left">'+status[1].to_s+'</td>
38
- <td align="right">'+status[0].to_s+'</td>
39
- <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
40
- </tr>'
41
- elsif (status[1] =~ /^Putative/)
42
- stats_file.puts ' <tr>
43
- <td align="left">Putative</td>
44
- <td align="right">'+status[0].to_s+'</td>
45
- <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
46
- </tr>'
47
- else
48
- stats_file.puts ' <tr>
49
- <td rowspan="2" align="left">'+status[1].to_s+'</td>
50
- <td align="left">Sure</td>
51
- <td align="right">'+status[0].to_s+'</td>
52
- <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
53
- </tr>'
54
- end
55
- status_suma += status[0]
56
- end
57
-
58
- # adding chimeric seqs
59
- stats_file.puts ' <tr>
60
- <td colspan="2" align="left">Putative chimera</td>
61
- <td align="right">'+chimera_total.to_s+'</td>
62
- <td align="right">'+'%.2f' % (100*chimera_total.to_f/total_seqs.to_f).to_s+' %</td>
63
- </tr>'
64
- status_suma += chimera_total
65
-
66
- # añadimos los coding, P.coding
67
- tcode_array.each do |status|
68
- if (status[1] == 'Coding')
69
- stats_file.puts ' <tr>
70
- <td rowspan="2" align="left">'+status[1].to_s+'</td>
71
- <td align="left">Sure</td>
72
- <td align="right">'+status[0].to_s+'</td>
73
- <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
74
- </tr>'
75
- elsif (status[1] == 'Putative Coding')
76
- stats_file.puts ' <tr>
77
- <td align="left">Putative</td>
78
- <td align="right">'+status[0].to_s+'</td>
79
- <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
80
- </tr>'
4
# Builds the accumulator for the FLN summary statistics.
#
# Every counter starts at 0. Keys are inserted in the order listed here, which
# is also the order in which write_txt emits them (it iterates the hash).
#
# @return [Hash{String => Integer}] a fresh hash with all counters zeroed
def initialize_stats_hash
  counter_names = [
    # Global counts
    'input_seqs', 'output_seqs', 'failed',
    'sequences_>200', 'sequences_>500', 'longest_unigene', 'good_seqs',
    # Artifacts
    'artifacts', 'misassembled', 'chimeras', 'other_artifacts',
    # Unknown sequences
    'unknown', 'unknown_>200', 'unknown_>500',
    # Protein annotation, by structural status
    'prot_annotated',
    'complete', 'complete_sure', 'complete_putative',
    'n_terminal', 'n_terminal_sure', 'n_terminal_putative',
    'c_terminal', 'c_terminal_sure', 'c_terminal_putative',
    'internal',
    # Protein annotation, by database
    'swissprot', 'trembl', 'userdb',
    # ncRNA and coding predictions
    'ncrna',
    'coding', 'coding_sure', 'coding_putative', 'coding_>200', 'coding_>500',
    # Values filled in by last_stats
    'different_orthologues', 'different_completes', 'BA_index'
  ]

  counter_names.each_with_object({}) { |key, stats_hash| stats_hash[key] = 0 }
end
46
+
47
# Extracts an organism name from a hit description and tallies it.
#
# Three description layouts are tried, in this order:
# 1. text containing 'OS=': the organism is whatever follows 'OS=' up to ' GN=';
# 2. text starting with 'sp=' or 'tr=': the two words that precede two (or,
#    failing that, one) parenthesised groups;
# 3. anything else: the text after the first ';' and before the first '.'.
# Only the first two words of the result are used as the counter key.
#
# @param name [String] description to parse; it is never modified
# @param taxonomy [Hash{String => Integer}] organism counters, updated in place
# @return [Integer, nil] the new count for the organism, or nil when no
#   organism could be extracted
def get_taxonomy(name, taxonomy)
  organism = nil
  if name.include?('OS=')
    # to_s: a description ending right after 'OS=' leaves nothing to split.
    organism = name.split('OS=', 2).last.split(' GN=').first.to_s.strip
  elsif name.start_with?('sp=', 'tr=')
    # This used to be `name[0..2] = 'sp=' || ...`, an assignment that overwrote
    # the first characters of the caller's string and was always true, so the
    # final branch below could never run. It is now a real prefix comparison.
    organism = name[/(\w+ \w+) \(([\w ]+)\) \(([\w ]+)\)/, 1] ||
               name[/(\w+ \w+) \(([\w ]+)\)/, 1]
  else
    organism = name.split(';', 2).last.to_s.split('.', 2).first.to_s
    organism = organism.gsub(/\(\D+\)/, '')
    if organism.split(' ').length > 1
      # NOTE(review): the last substitution strips every remaining space, which
      # fuses multi-word names into one word. Confirm whether collapsing
      # repeated spaces was intended instead.
      organism = organism.gsub('.', '').gsub(/^ /, '').gsub(' ', '').strip
    end
  end
  return nil if organism.nil?

  organism = organism.split(' ')[0..1].join(' ')
  return nil if organism.empty?

  taxonomy[organism] = (taxonomy[organism] || 0) + 1
end
141
82
 
142
- # imprimimos la tabla Database Usage --------------------------------------------------------------------------------------------
143
- stats_file.puts html_db
144
- db_names=["UserDB", "SwissProt", "TrEMBL"]
145
- total_db = 0
146
-
147
- for i in 0..db_usage.length-1 do i
148
- total_db += db_usage[i]
149
- stats_file.puts ' <tr>
150
- <td align="left">'+db_names[i].to_s+'</td>
151
- <td align="right">'+db_usage[i].to_s+'</td>
152
- <td align="right">'+'%.2f' % (100*db_usage[i].to_f/total_seqs.to_f).to_s+' %</td>
153
- </tr>'
154
- end
155
- no_db = seqs_number2 + ncrna_total.to_i
156
- stats_file.puts ' <tr>
157
- <td align="left">None</td>
158
- <td align="right">'+no_db.to_s+'</td>
159
- <td align="right">'+'%.2f' % (100*no_db.to_f/total_seqs.to_f).to_s+' %</td>
160
- </tr>'
161
- total_db += no_db
162
- stats_file.puts ' <tr>
163
- <td align="left">Total</td>
164
- <td align="right">'+total_db.to_s+'</td>
165
- <td align="right">'+'%.2f' % (100*total_db.to_f/total_seqs.to_f).to_s+' %</td>
166
- </tr>
167
- </table>'
168
-
169
- # imprimimos la tabla Report guiding assembly quality -------------------------------------------------------------
170
- stats_file.puts html_as
171
- stats_file.puts ' <tr>
172
- <td align="left">Unigenes</td>
173
- <td align="right">'+total_seqs.to_s+'</td>
174
- <td align="right">'+'%.2f' % (100*total_seqs.to_f/total_seqs.to_f).to_s+' %</td>
175
- </tr>'
176
- stats_file.puts ' <tr>
177
- <td align="left">Unigenes >'+size_filter2.to_s+'pb</td>
178
- <td align="right">'+uni_500.to_s+'</td>
179
- <td align="right">'+'%.2f' % (100*uni_500.to_f/total_seqs.to_f).to_s+' %</td>
180
- </tr>'
181
- stats_file.puts ' <tr>
182
- <td align="left">Unigenes >'+size_filter1.to_s+'pb</td>
183
- <td align="right">'+uni_200.to_s+'</td>
184
- <td align="right">'+'%.2f' % (100*uni_200.to_f/total_seqs.to_f).to_s+' %</td>
185
- </tr>'
186
- stats_file.puts ' <tr>
187
- <td align="left">Longest unigene</td>
188
- <td align="right">'+longest_one.to_s+'</td>
189
- <td align="right">-</td>
190
- </tr>'
191
- stats_file.puts ' <tr>
192
- <td align="left">With orthologue <sup>1</sup></td>
193
- <td align="right">'+seqs_number1.to_s+'</td>
194
- <td align="right">'+'%.2f' % (100*seqs_number1.to_f/total_seqs.to_f).to_s+' %</td>
195
- </tr>'
196
-
197
- if (seqs_number1.to_i > 0)
198
- stats_file.puts ' <tr>
199
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Different orthologue IDs</td>
200
- <td align="right">'+seq_uniq.to_s+'</td>
201
- <td align="right">'+'%.2f' % (100*seq_uniq.to_f/seqs_number1.to_f).to_s+' %</td>
202
- </tr>'
203
- stats_file.puts ' <tr>
204
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Complete transcripts</td>
205
- <td align="right">'+status_array[0][0].to_s+'</td>
206
- <td align="right">'+'%.2f' % (100*status_array[0][0].to_f/seqs_number1.to_f).to_s+' %</td>
207
- </tr>'
208
- stats_file.puts ' <tr>
209
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Different complete transcripts</td>
210
- <td align="right">'+complete_uniq.to_s+'</td>
211
- <td align="right">'+'%.2f' % (100*complete_uniq.to_f/seqs_number1.to_f).to_s+' %</td>
212
- </tr>'
213
- stats_file.puts ' <tr>
214
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Misassembled</td>
215
- <td align="right">'+error_1_num.to_s+'</td>
216
- <td align="right">'+'%.2f' % (100*error_1_num.to_f/seqs_number1.to_f).to_s+' %</td>
217
- </tr>'
218
- stats_file.puts ' <tr>
219
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative chimera</td>
220
- <td align="right">'+chimera_total.to_s+'</td>
221
- <td align="right">'+'%.2f' % (100*chimera_total.to_f/seqs_number1.to_f).to_s+' %</td>
222
- </tr>'
223
- end
224
- stats_file.puts ' <tr>
225
- <td align="left">Without orthologue <sup>1</sup></td>
226
- <td align="right">'+no_db.to_s+'</td>
227
- <td align="right">'+'%.2f' % (100*seqs_number2.to_f/total_seqs.to_f).to_s+' %</td>
228
- </tr>'
229
-
230
- if (no_db.to_i > 0) && (seqs_number2.to_i > 0)
231
- stats_file.puts ' <tr>
232
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding (all)</td>
233
- <td align="right">'+tcode_array[0][0].to_s+'</td>
234
- <td align="right">'+'%.2f' % (100*tcode_array[0][0].to_f/no_db.to_f).to_s+' %</td>
235
- </tr>'
236
- stats_file.puts ' <tr>
237
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding > '+size_filter1.to_s+'bp</td>
238
- <td align="right">'+tcode_array[0][2].to_s+'</td>
239
- <td align="right">'+'%.2f' % (100*tcode_array[0][2].to_f/no_db.to_f).to_s+' %</td>
240
- </tr>'
241
- stats_file.puts ' <tr>
242
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding > '+size_filter2.to_s+'bp</td>
243
- <td align="right">'+tcode_array[0][3].to_s+'</td>
244
- <td align="right">'+'%.2f' % (100*tcode_array[0][3].to_f/no_db.to_f).to_s+' %</td>
245
- </tr>'
246
- stats_file.puts ' <tr>
247
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding (all)</td>
248
- <td align="right">'+tcode_array[1][0].to_s+'</td>
249
- <td align="right">'+'%.2f' % (100*tcode_array[1][0].to_f/no_db.to_f).to_s+' %</td>
250
- </tr>'
251
- stats_file.puts ' <tr>
252
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding > '+size_filter1.to_s+'bp</td>
253
- <td align="right">'+tcode_array[1][2].to_s+'</td>
254
- <td align="right">'+'%.2f' % (100*tcode_array[1][2].to_f/no_db.to_f).to_s+' %</td>
255
- </tr>'
256
- stats_file.puts ' <tr>
257
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding > '+size_filter2.to_s+'bp</td>
258
- <td align="right">'+tcode_array[1][3].to_s+'</td>
259
- <td align="right">'+'%.2f' % (100*tcode_array[1][3].to_f/no_db.to_f).to_s+' %</td>
260
- </tr>'
261
- stats_file.puts ' <tr>
262
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative ncRNA</td>
263
- <td align="right">'+ncrna_total.to_s+'</td>
264
- <td align="right">'+'%.2f' % (100*ncrna_total.to_f/no_db.to_f).to_s+' %</td>
265
- </tr>'
266
- stats_file.puts ' <tr>
267
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown (all)</td>
268
- <td align="right">'+tcode_array[2][0].to_s+'</td>
269
- <td align="right">'+'%.2f' % (100*tcode_array[2][0].to_f/no_db.to_f).to_s+' %</td>
270
- </tr>'
271
- stats_file.puts ' <tr>
272
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown > '+size_filter1.to_s+'bp</td>
273
- <td align="right">'+tcode_array[2][2].to_s+'</td>
274
- <td align="right">'+'%.2f' % (100*tcode_array[2][2].to_f/no_db.to_f).to_s+' %</td>
275
- </tr>'
276
- stats_file.puts ' <tr>
277
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown > '+size_filter2.to_s+'bp</td>
278
- <td align="right">'+tcode_array[2][3].to_s+'</td>
279
- <td align="right">'+'%.2f' % (100*tcode_array[2][3].to_f/no_db.to_f).to_s+' %</td>
280
- </tr>'
281
- end
282
- stats_file.puts ' </table>
283
- <sup>1</sup> Percents for subclassifications of this category were calculated using this line as 100% reference.'
284
-
83
# Builds the accumulator for the RepTrans statistics.
#
# The 'coding_>…' keys are the bins filled by coding_stats_reptrans.
#
# @return [Hash{String => Integer}] a fresh hash with all counters zeroed
def initialize_stats_hash_reptrans
  {
    'prot_annotated' => 0,
    'est_annotated'  => 0,
    'coding_>1'      => 0,
    'coding_>0.94'   => 0,
    'coding_>0.84'   => 0,
    'coding_>0.73'   => 0,
    'coding_>0'      => 0
  }
end
94
+
95
# Adds one batch of classified sequences to the running statistics.
#
# The type constants (FAILED, OTHER, MISASSEMBLED, CHIMERA, UNKNOWN, COMPLETE,
# N_TERMINAL, C_TERMINAL, INTERNAL, NCRNA, CODING) are defined outside this
# method (presumably in 'types.rb', required at the top of the file) and are
# compared numerically here.
#
# @param seqs [Array] sequences of the batch; each must answer #type,
#   #fasta_length, #seq_name, #status, #db_name and, when annotated, #hit.acc
# @param stats_hash [Hash{String => Numeric}] counters, updated in place
# @param diff_ids_array [Array] accumulated hit accessions of annotated
#   sequences, kept free of duplicates, updated in place
# @param diff_ids_complete_array [Array] same, restricted to COMPLETE sequences
# @return [Array] stats_hash, diff_ids_array and diff_ids_complete_array
def summary_stats(seqs, stats_hash, diff_ids_array, diff_ids_complete_array)
  low_limit = 200
  upper_limit = 500

  with_type   = lambda { |type| seqs.select { |s| s.type == type } }
  longer_than = lambda { |list, limit| list.count { |s| s.fasta_length > limit } }
  sure        = lambda { |list| list.count { |s| s.status } }
  putative    = lambda { |list| list.count { |s| !s.status } }

  # All seqs
  #-----------
  stats_hash['output_seqs'] += seqs.length
  good_seqs = seqs.select { |s| s.type >= UNKNOWN }
  stats_hash['good_seqs'] += good_seqs.length

  # Longest unigene. An empty batch has no maximum (nil) and must leave the
  # current record untouched instead of raising on the comparison.
  current_longest_unigene = seqs.map { |s| s.fasta_length }.max
  if current_longest_unigene && current_longest_unigene > stats_hash['longest_unigene']
    stats_hash['longest_unigene'] = current_longest_unigene
  end

  # Load ids
  seqs.each do |s|
    diff_ids_array << s.hit.acc if s.type > UNKNOWN && s.type < NCRNA
  end
  diff_ids_array.uniq!

  # By length
  stats_hash['sequences_>200'] += longer_than.call(good_seqs, low_limit)
  stats_hash['sequences_>500'] += longer_than.call(good_seqs, upper_limit)

  stats_hash['failed'] += with_type.call(FAILED).length

  # Unknown
  #-----------------------------
  all_unknown = with_type.call(UNKNOWN)
  stats_hash['unknown'] += all_unknown.length
  stats_hash['unknown_>200'] += longer_than.call(all_unknown, low_limit)
  stats_hash['unknown_>500'] += longer_than.call(all_unknown, upper_limit)

  # Artifacts
  #----------------
  stats_hash['artifacts'] += seqs.count { |s| s.type < UNKNOWN && s.type > FAILED }
  stats_hash['misassembled'] += with_type.call(MISASSEMBLED).length
  # Pieces named '..._split_...' are skipped so that a multiple chimera is
  # counted only once.
  stats_hash['chimeras'] += seqs.count { |s| s.type == CHIMERA && !s.seq_name.include?('_split_') }
  stats_hash['other_artifacts'] += with_type.call(OTHER).length

  # Annotated with prot
  #---------------------
  prot_annotated = seqs.select { |s| s.type >= COMPLETE && s.type <= INTERNAL }
  stats_hash['prot_annotated'] += prot_annotated.length

  # By annotation
  stats_hash['internal'] += with_type.call(INTERNAL).length
  complete = with_type.call(COMPLETE)
  n_terminal = with_type.call(N_TERMINAL)
  c_terminal = with_type.call(C_TERMINAL)

  stats_hash['complete'] += complete.length
  stats_hash['n_terminal'] += n_terminal.length
  stats_hash['c_terminal'] += c_terminal.length

  # Load complete ids
  complete.each { |s| diff_ids_complete_array << s.hit.acc }
  diff_ids_complete_array.uniq!

  # ----> By status (a truthy #status counts as "sure", anything else as "putative")
  stats_hash['complete_sure'] += sure.call(complete)
  stats_hash['n_terminal_sure'] += sure.call(n_terminal)
  stats_hash['c_terminal_sure'] += sure.call(c_terminal)
  stats_hash['complete_putative'] += putative.call(complete)
  stats_hash['n_terminal_putative'] += putative.call(n_terminal)
  stats_hash['c_terminal_putative'] += putative.call(c_terminal)

  # By database: names starting with 'sp_' or 'tr_' are counted as SwissProt or
  # TrEMBL; every other annotated sequence is attributed to the user database.
  swissprot = prot_annotated.count { |s| s.db_name =~ /^sp_/ }
  trembl = prot_annotated.count { |s| s.db_name =~ /^tr_/ }
  stats_hash['swissprot'] += swissprot
  stats_hash['trembl'] += trembl
  stats_hash['userdb'] += prot_annotated.length - swissprot - trembl

  # ncRNA
  #----------------
  stats_hash['ncrna'] += with_type.call(NCRNA).length

  # Coding sequences
  #----------------
  coding = with_type.call(CODING)
  stats_hash['coding'] += coding.length
  stats_hash['coding_sure'] += sure.call(coding)
  stats_hash['coding_putative'] += putative.call(coding)
  stats_hash['coding_>200'] += longer_than.call(coding, low_limit)
  stats_hash['coding_>500'] += longer_than.call(coding, upper_limit)

  [stats_hash, diff_ids_array, diff_ids_complete_array]
end
290
193
 
194
# Fills in the statistics that can only be computed once every batch is in:
# the number of distinct hit ids and the BA index.
#
# The BA index is left at its previous value unless all five quantities it is
# built from are greater than zero.
#
# @param stats_hash [Hash{String => Numeric}] counters, updated in place
# @param diff_ids_array [Array] distinct ids of annotated sequences
# @param diff_ids_complete_array [Array] distinct ids of complete sequences
# @return [Hash] stats_hash
def last_stats(stats_hash, diff_ids_array, diff_ids_complete_array)
  stats_hash['different_orthologues'] = diff_ids_array.length
  stats_hash['different_completes'] = diff_ids_complete_array.length

  # BA index
  required = ['prot_annotated', 'complete', 'sequences_>500',
              'different_orthologues', 'different_completes']
  if required.all? { |key| stats_hash[key] > 0 }
    annotated = stats_hash['prot_annotated']
    complete = stats_hash['complete']
    # The 1.0 factors force floating-point division.
    annotation_coef = (annotated * complete * 1.0) / (stats_hash['sequences_>500'] * 10000)
    improvement_coef = (stats_hash['different_orthologues'] * 1.0 + stats_hash['different_completes']) /
                       (annotated + complete)
    stats_hash['BA_index'] = Math.sqrt(annotation_coef * improvement_coef)
  end

  stats_hash
end
291
210
 
211
# Counts a coding sequence in the bin that matches its t_code value.
#
# Bins are tested from the highest threshold down and the first one whose
# threshold is strictly exceeded wins; values of 0 or less are not counted.
#
# NOTE(review): the thresholds 0.95 and 0.85 feed the bins labelled
# 'coding_>0.94' and 'coding_>0.84'. Confirm the mismatch is deliberate.
#
# @param coding_seq [#t_code] sequence exposing a numeric t_code
# @param stats_hash [Hash{String => Integer}] counters, updated in place
# @return [Integer, nil] the new value of the incremented bin, or nil when no
#   bin applies
def coding_stats_reptrans(coding_seq, stats_hash)
  bins = [
    [1,    'coding_>1'],
    [0.95, 'coding_>0.94'],
    [0.85, 'coding_>0.84'],
    [0.73, 'coding_>0.73'],
    [0,    'coding_>0']
  ]
  score = coding_seq.t_code
  _threshold, group = bins.find { |threshold, _label| score > threshold }
  stats_hash[group] += 1 if group
end
292
228
 
229
# Finalises the statistics and writes both summary reports.
#
# @param stats_hash [Hash] counters; completed through last_stats
# @param stats_taxonomy [Hash] organism counters, forwarded to write_html
# @param diff_ids_array [Array] distinct ids of annotated sequences
# @param diff_ids_complete_array [Array] distinct ids of complete sequences
# @param txt_file [IO] open handle for the text report
# @param html_file [File] open handle for the HTML report
def write_summary_stats(stats_hash, stats_taxonomy, diff_ids_array, diff_ids_complete_array, txt_file, html_file)
  final_stats = last_stats(stats_hash, diff_ids_array, diff_ids_complete_array)
  write_txt(final_stats, txt_file)
  write_html(final_stats, html_file, stats_taxonomy)
end
293
234
 
294
- def html_code
295
- html_head = '<html>
296
- <head>
297
- <title>FLN Summary</title>
298
- </head>
299
-
300
- <body bgcolor="#FFFFFF">
301
- <center>
302
- <h1 align="center">
303
- Full-LengtherNEXT Summary
304
- </h1>'
305
-
306
-
307
- html_1 = '
308
- <h2 align="center">
309
- Status report
310
- </h2>
311
-
312
- <table border="2" cellspacing="0" cellpadding="2">
313
- <tr>
314
- <th colspan="2">Status</th>
315
- <th>Unigenes</th>
316
- <th>%</th>
317
- </tr>'
318
-
319
- html_2= '
320
- <h2 align="center">
321
- Unigene report
322
- </h2>
323
-
324
- <table border="2" cellspacing="0" cellpadding="2">
325
- <tr>
326
- <th></th>
327
- <th>Unigenes</th>
328
- <th>%</th>
329
- </tr>'
330
-
331
- html_3= '
332
- <h2 align="center">
333
- Database usage
334
- </h2>
335
-
336
- <table border="2" cellspacing="0" cellpadding="2">
337
- <tr>
338
- <th></th>
339
- <th>Unigenes</th>
340
- <th>%</th>
341
- </tr>'
342
-
343
- html_4= '
344
- <h2 align="center">
345
- Report guiding assembly quality
346
- </h2>
347
-
348
- <table border="2" cellspacing="0" cellpadding="2">
349
- <tr>
350
- <th></th>
351
- <th>Unigenes</th>
352
- <th>%</th>
353
- </tr>'
354
-
355
- html_5 = ' </body>
356
- </html>'
357
-
358
- return [html_head, html_1, html_2, html_3, html_4, html_5]
235
# Writes the RepTrans statistics as a text report and an HTML report.
#
# Both files are created (or truncated) and are closed again before the method
# returns, even when one of the writers raises; previously the handles were
# left open.
#
# Note the argument order: the HTML path comes before the text path.
#
# @param stats_hash [Hash] statistics to report
# @param html_file [String] path of the HTML report
# @param txt_file [String] path of the text report
def write_reptrans_stats(stats_hash, html_file, txt_file)
  File.open(html_file, 'w') do |html|
    File.open(txt_file, 'w') do |txt|
      write_txt(stats_hash, txt)
      write_html_reptrans(stats_hash, html)
    end
  end
end
241
+
242
# Writes the complete RepTrans HTML document: the opening tag, the head
# (header), the body (body_reptrans) and the closing tag.
#
# @param stats_hash [Hash] statistics, forwarded to body_reptrans
# @param html_file [IO] open, writable handle
def write_html_reptrans(stats_hash, html_file)
  html_file.puts '<html>'
  header(html_file)
  body_reptrans(html_file, stats_hash)
  html_file.puts '</html>'
end
359
248
 
249
# Dumps the statistics as text, one "<value><TAB><key>" line per entry, in the
# hash's own iteration order.
#
# @param stats_hash [Hash] statistics to dump
# @param file [IO] open, writable handle
# @return [Hash] stats_hash
def write_txt(stats_hash, file)
  stats_hash.each { |key, value| file.puts "#{value}\t#{key}" }
end
361
254
 
255
# Writes the FLN summary as an HTML document and makes sure the bundled
# 'expresscanvas' JavaScript directory sits next to it.
#
# The archive is looked up two directories above this source file and is
# unpacked with the external `unzip` tool only when the target directory does
# not exist yet. The command is passed to Kernel#system as separate arguments,
# so paths containing spaces or shell metacharacters are handled literally.
#
# @param stats_hash [Hash] statistics, forwarded to html_header and body
# @param html_file [File] open, writable handle; must respond to #to_path
# @param stats_taxonomy [Hash] organism counters, forwarded to html_header
def write_html(stats_hash, html_file, stats_taxonomy)
  js_path = File.dirname(html_file.to_path)
  # File.exist? replaces File.exists?, which no longer exists in Ruby 3.2+.
  unless File.exist?(File.join(js_path, 'expresscanvas'))
    archive = File.join(File.dirname(__FILE__), '..', '..', 'expresscanvas.zip')
    system('unzip', '-qq', archive, '-d', js_path)
  end

  html_file.puts '<html>'
  html_header(html_file, stats_hash, stats_taxonomy)
  body(html_file, stats_hash)
  html_file.puts '</html>'
end
362
263
 
363
- def annotation_stats(size_filter1,size_filter2)
364
-
365
- seqs_number = 0
366
- array_of_all_accs = []
367
- array_of_complete_accs = []
368
- error_1_num = 0
369
- uni_500 = 0
370
- uni_200 = 0
371
- longest_one = 0
372
-
373
- status_array = []
374
- # total, status
375
- complete = [0,'Complete']
376
- putative_complete = [0,'Putative Complete']
377
- c_terminus = [0,'C-terminus']
378
- putative_c_terminus = [0,'Putative C-terminus']
379
- n_terminus = [0,'N-terminus']
380
- putative_n_terminus = [0,'Putative N-terminus']
381
- internal = [0,'Internal']
382
- cod_seq = [0,'Misassembled']
383
-
384
- #userdb, SwissProt, TrEMBL
385
- db_usage = [0,0,0]
386
-
387
- File.open('fln_results/dbannotated.txt').each do |line|
388
- line.chomp!
389
- (name,fasta_length,acc,db_name,status,kk1,kk2,kk3,kk4,kk5,msgs) = line.split("\t")
390
-
391
- if (line !~ /^Query_id\t/) && (!line.empty?)
392
- seqs_number += 1
393
- if (fasta_length.to_i > longest_one)
394
- longest_one = fasta_length.to_i
395
- end
396
- array_of_all_accs.push acc
397
-
398
- if (db_name !~ /^sp_/) && (db_name !~ /^tr_/)
399
- db_usage[0] += 1
400
- elsif (db_name =~ /^sp_/)
401
- db_usage[1] += 1
402
- elsif (db_name =~ /^tr_/)
403
- db_usage[2] += 1
404
- end
405
-
406
- # -------------------------------------------------------------------------
407
- if (fasta_length.to_i >= size_filter1)
408
- uni_200 += 1
409
- end
410
- if (fasta_length.to_i >= size_filter2)
411
- uni_500 += 1
412
- end
413
- # -------------------------------------------------------------------------
414
- if (msgs =~ /ERROR#1/)
415
- error_1_num += 1
416
- end
417
- # -------------------------------------------------------------------------
418
- if (status == 'Complete')
419
- complete[0] += 1
420
- array_of_complete_accs.push acc
421
-
422
- elsif (status == 'Putative Complete')
423
- putative_complete[0] += 1
424
- elsif (status == 'C-terminus')
425
- c_terminus[0] += 1
426
- elsif (status == 'N-terminus')
427
- n_terminus[0] += 1
428
- elsif (status == 'Putative C-terminus')
429
- putative_c_terminus[0] += 1
430
- elsif (status == 'Putative N-terminus')
431
- putative_n_terminus[0] += 1
432
- elsif (status == 'Internal')
433
- internal[0] += 1
434
- elsif (status == 'Misassembled')
435
- cod_seq[0] += 1
436
- end
437
- # -------------------------------------------------------------------------
438
- end
264
# Writes a minimal HTML <head> that only carries the page title.
#
# @param html_file [IO] open, writable handle
def header(html_file)
  html_file.puts '<head>', '<title>FLN Summary</title>', '</head>'
end
439
269
 
440
- end
270
# Writes the <head> of the FLN summary page, including the script that draws
# two CanvasXpress charts: 'profile' (structural status, as a percentage of
# stats_hash['good_seqs']) and 'taxonomy' (the 21 most frequent organisms).
#
# Changes with respect to the previous implementation:
# * percentages are 0.0 when there are no good sequences, instead of NaN or
#   Infinity leaking into the generated JavaScript;
# * the 'Sure' series uses 'coding_sure' for the Coding sample. It used
#   'coding', which already contains the putative ones that the 'Putative'
#   series stacks on top;
# * organism names are escaped before being placed in JS single-quoted strings.
#
# @param html_file [IO] open, writable handle
# @param stats_hash [Hash{String => Numeric}] summary statistics
# @param stats_taxonomy [Hash{String => Integer}] organism counters
def html_header(html_file, stats_hash, stats_taxonomy)
  # One entry per chart sample: Unknown, Complete, N-terminal, C-terminal,
  # Internal, ncrna, Coding. nil marks a sample without a putative class.
  sure_keys = ['unknown', 'complete_sure', 'n_terminal_sure', 'c_terminal_sure',
               'internal', 'ncrna', 'coding_sure']
  putative_keys = [nil, 'complete_putative', 'n_terminal_putative', 'c_terminal_putative',
                   nil, nil, 'coding_putative']
  structural_data_sure = sure_keys.map { |key| stats_hash[key] }
  structural_data_putative = putative_keys.map { |key| key ? stats_hash[key] : 0 }

  good_seqs = stats_hash['good_seqs']
  percentage = lambda { |stat| good_seqs > 0 ? stat * 100.0 / good_seqs : 0.0 }
  values_structural_sure = "[#{structural_data_sure.map(&percentage).join(', ')}]"
  values_structural_putative = "[#{structural_data_putative.map(&percentage).join(', ')}]"

  # Most frequent organisms first; 0..20 keeps at most 21 of them.
  data = stats_taxonomy.to_a.sort { |s2, s1| s1.last <=> s2.last }[0..20]
  js_labels = data.map { |tax| tax.first.to_s.gsub(/[\\']/) { |char| "\\#{char}" } }
  smps_taxonomy = "['#{js_labels.join("', '")}']"
  values_taxonomy = "[#{data.map { |tax| tax.last }.join(', ')}]"

  html_file.puts '<head>
    <title>FLN Summary</title>
    <meta http-equiv="CACHE-CONTROL" CONTENT="NO-CACHE">
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />

    <!--[if lt IE 9]><script type="text/javascript" src="./expresscanvas/js/flashcanvas.js"></script><![endif]-->
    <script type="text/javascript" src="./expresscanvas/js/canvasXpress.min.js"></script>

    <script id=\'demoScript\'>
      var showDemo = function () {'

  #'smpTitle': 'Status',

  html_file.puts "new CanvasXpress('profile',
        {
          'y' : {
            'vars' : ['Sure', 'Putative'],
            'smps' : ['Unknown', 'Complete', 'N-terminal', 'C-terminal', 'Internal', 'ncrna', 'Coding'],
            'data' : [#{values_structural_sure},
                      #{values_structural_putative}],
          },
          'a' : {
            'xAxis' : ['Sure', 'Putative']
          },
        },

        {'gradient': false,
          'toolbarPermanent': true,
          'graphOrientation': 'vertical',
          'graphType': 'Stacked',
          'legendBackgroundColor': false,
          'smpLabelScaleFontFactor': 0.8,
          'xAxisTitle': '% sequences',
          'xAxis2Show': false,
          'xAxisExact': true,
          'setMaxX': 80,
          'setMinX': 0,
          'axisTitleScaleFontFactor': 2,
          'smpTitleFontStyle': 'italic',
          'titleHeight': 60
        }
      );

      new CanvasXpress('taxonomy',
        {
          'y' : {
            'vars' : ['Annotations'],
            'smps' : #{smps_taxonomy},
            'data' : [#{values_taxonomy}],
          },
          'a' : {
            'xAxis' : ['Sure', 'Putative']
          },
        },

        {'gradient': false,
          'toolbarPermanent': true,
          'graphOrientation': 'horizontal',
          'showLegend': false,
          'smpLabelScaleFontFactor': 1.5,
          'xAxisTitle': 'Number of sequences',
          'xAxis2Show': false,
          'titleHeight': 60
        }
      );
    }
    </script>
    </head>"
end
447
366
 
448
- def testcode_stats(size_filter1,size_filter2)
449
-
450
- seqs_number = 0
451
- uni_500 = 0
452
- uni_200 = 0
453
- longest_one = 0
454
-
455
- # total, status
456
- coding_stats = [0,'Coding',0,0]
457
- p_coding_stats = [0,'Putative Coding',0,0]
458
- unknown_stats = [0,'Unknown',0,0]
459
-
460
- File.open('fln_results/new_coding.txt').each do |line|
461
- line.chomp!
462
- (name,fasta_length,acc,db_name,status) = line.split("\t")
463
-
464
- if (line !~ /^Query_id\t/) && (!line.empty?)
465
- seqs_number += 1
466
-
467
- if (fasta_length.to_i > longest_one)
468
- longest_one = fasta_length.to_i
469
- end
470
-
471
- # -------------------------------------------------------------------------
472
- if (fasta_length.to_i >= size_filter1)
473
- uni_200 += 1
474
- end
475
- if (fasta_length.to_i >= size_filter2)
476
- uni_500 += 1
477
- end
478
- # -------------------------------------------------------------------------
479
-
480
- if (fasta_length.to_i > size_filter1)
481
- if (status == 'coding')
482
- coding_stats[2] += 1
483
- elsif (status == 'putative_coding')
484
- p_coding_stats[2] += 1
485
- elsif (status == 'unknown')
486
- unknown_stats[2] += 1
487
- end
488
- end
489
-
490
- if (fasta_length.to_i > size_filter2)
491
- if (status == 'coding')
492
- coding_stats[3] += 1
493
- elsif (status == 'putative_coding')
494
- p_coding_stats[3] += 1
495
- elsif (status == 'unknown')
496
- unknown_stats[3] += 1
497
- end
498
- end
499
-
500
- if (status == 'coding')
501
- coding_stats[0] += 1
502
- elsif (status == 'putative_coding')
503
- p_coding_stats[0] += 1
504
- elsif (status == 'unknown')
505
- unknown_stats[0] += 1
506
- end
367
# Writes the <body> section of the representative-transcriptome summary page:
# a title followed by the two report tables.
#
# html_file  - object responding to #puts that receives the HTML.
# stats_hash - statistics hash, forwarded unchanged to both report helpers.
def body_reptrans(html_file, stats_hash)
  html_file.puts '<body bgcolor="#FFFFFF" >', '<center>' # Start body
  html_file.puts '<div style="float:center; font-size:30; margin:10px"><b>', 'Full-LengtherNEXT Representative Transcriptome Summary', '</b></div>'
  # TABLES
  html_file.puts '<div style=" width: 850px; height: 350px; padding: 10 ">'
  reptrans_report(html_file, stats_hash, 'left')
  # The align value is interpolated into a CSS "float:" declaration, so it
  # must be the valid keyword 'right' (it used to be misspelt 'rigth').
  reptrans_acumulative_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  # END TABLES
  html_file.puts '</center>', '</body>' # End body
end
507
378
 
508
- end
379
# Writes the <body> section of the main summary page: a title and three
# containers holding, in order, the general/assembly tables, the status
# graph/table and the taxonomy graph/database table.
#
# html_file  - object responding to #puts that receives the HTML.
# stats_hash - statistics hash, forwarded unchanged to the table helpers.
def body(html_file, stats_hash)
  # NOTE(review): 'id=demo' sits inside the onload attribute value; it looks
  # like it was meant to be a separate id attribute -- confirm before changing.
  html_file.puts '<body bgcolor="#FFFFFF" onload="showDemo(); id=demo">', '<center>' # Start body
  html_file.puts '<div style="float:center; font-size:30; margin:10px"><b>', 'Full-LengtherNEXT Summary', '</b></div>'

  # TABLES
  html_file.puts '<div style="overflow: hidden; width: 950px; height: 550px; padding: 10 ">'
  general_report(html_file, stats_hash, 'left')
  assembly_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  html_file.puts '<div style="overflow: hidden; width: 950px; height: 550px; padding: 10 ">'
  status_graph(html_file, 'left')
  # The align value ends up in a CSS "float:" declaration, so it has to be
  # the valid keyword 'right' (previously misspelt 'rigth').
  status_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  html_file.puts '<div style="overflow: hidden; width: 950px; height: 750px; padding: 10 ">'
  taxonomy_graph(html_file, 'left')
  database_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  # END TABLES
  html_file.puts '</center>', '</body>' # End body
end
509
399
 
510
- end
511
400
 
512
- status_array = [coding_stats, p_coding_stats, unknown_stats]
513
401
 
514
- return [status_array, seqs_number, uni_500, uni_200, longest_one]
515
- end
402
# Writes the "Sequences info" table of the representative-transcriptome page.
# The reference total (100%) is the sum of every value stored in stats_hash.
#
# html_file  - object handed to write_array_html together with the markup.
# stats_hash - hash of numeric counters; the keys read here are listed below.
# align      - String interpolated into the wrapper's CSS "float:" value.
def reptrans_report(html_file, stats_hash, align)
  # inject instead of a side-effecting map: no throwaway array is built.
  all_seqs = stats_hash.values.inject(0) { |sum, count| sum + count }

  html = []
  html << '<div style=" margin: 0; float:'+align+'">'
  html << table_title('Sequences info')
  html.concat(table_header(['', 'Sequences', '%'], 0))
  html.concat(single_row('Output', all_seqs, all_seqs))
  [
    ['Annotated with protein', 'prot_annotated'],
    ['Annotated with EST', 'est_annotated'],
    ['Coding test-code > 1', 'coding_>1'],
    ['Coding test-code > 0.94', 'coding_>0.94'],
    ['Coding test-code > 0.84', 'coding_>0.84'],
    ['Coding test-code > 0.73', 'coding_>0.73'],
    ['Coding test-code > 0', 'coding_>0']
  ].each do |label, key|
    html.concat(single_row(label, stats_hash[key], all_seqs))
  end
  html << '</table>'
  html << '</div>'
  write_array_html(html, html_file)
end
516
421
 
422
# Writes the cumulative "Sequences summary" table of the
# representative-transcriptome page: each row shows its own counter plus the
# counters of all rows above it. The reference total (100%) is the sum of
# every value stored in stats_hash.
#
# html_file  - object handed to write_array_html together with the markup.
# stats_hash - hash of numeric counters; the keys read here are listed below.
# align      - String interpolated into the wrapper's CSS "float:" value.
def reptrans_acumulative_report(html_file, stats_hash, align)
  # inject instead of a side-effecting map: no throwaway array is built.
  all_seqs = stats_hash.values.inject(0) { |sum, count| sum + count }

  html = []
  html << '<div style=" margin: 0; float:'+align+'">'
  html << table_title('Sequences summary (Acumulative)')
  html.concat(table_header(['', 'Sequences', '%'], 0))

  acumulative = 0
  [
    ['Annotated with protein', 'prot_annotated'],
    ['Annotated with EST', 'est_annotated'],
    ['Coding test-code > 1', 'coding_>1'],
    ['Coding test-code > 0.94', 'coding_>0.94'],
    ['Coding test-code > 0.84', 'coding_>0.84'],
    ['Coding test-code > 0.73', 'coding_>0.73']
  ].each do |label, key|
    # Running total: this row's counter added to everything listed before it.
    acumulative += stats_hash[key]
    html.concat(single_row(label, acumulative, all_seqs))
  end

  html << '</table>'
  html << '</div>'
  write_array_html(html, html_file)
end
445
+
446
# Writes the "General info" table of the summary page.
#
# html_file  - object handed to write_array_html together with the markup.
# stats_hash - hash of counters; the keys read here are visible below.
#              'BA_index' must be numeric (it is compared with 0).
# align      - String interpolated into the wrapper's CSS "float:" value.
def general_report(html_file, stats_hash, align)
  html = []
  html << '<div style="margin: 0; float:'+align+'">'
  html << table_title('General info')
  html.concat(table_header(['', 'Sequences', '%'], 0))
  html.concat(single_row('Input', stats_hash['input_seqs'], stats_hash['input_seqs']))
  html.concat(single_row('Failing sequences', stats_hash['failed'], stats_hash['output_seqs']))
  html.concat(single_row('Artifacts <sup>1</sup>', stats_hash['artifacts'], stats_hash['output_seqs']))
  # Sub-categories of artifacts: indented rows (4th argument) whose percentage
  # is relative to the artifacts count. The literal `true` replaces the TRUE
  # constant, which no longer exists in Ruby 3.2+.
  html.concat(single_row('Misassembled', stats_hash['misassembled'], stats_hash['artifacts'], true))
  html.concat(single_row('Chimeras', stats_hash['chimeras'], stats_hash['artifacts'], true))
  html.concat(single_row('Other', stats_hash['other_artifacts'], stats_hash['artifacts'], true))
  html.concat(single_row('Sequences with resolved chimeras', stats_hash['output_seqs'], stats_hash['input_seqs']))
  html.concat(single_row('Sequences without artifacts', stats_hash['good_seqs'], stats_hash['output_seqs']))
  # The BA index row is only shown for a positive index; a nil total is passed
  # so no percentage is computed for it.
  html.concat(single_row('BA index', "%5.2f" % [stats_hash['BA_index']], nil)) if stats_hash['BA_index'] > 0
  html << '</table>'
  html << '</div>'
  write_array_html(html, html_file)
end
517
464
 
518
- def ncrna_stats(size_filter1,size_filter2)
465
# Writes the floated container for the taxonomy chart: a title followed by a
# one-cell table holding the <canvas id='taxonomy'> element.
#
# html_file - object responding to #puts that receives the HTML.
# align     - String interpolated into the wrapper's CSS "float:" value.
def taxonomy_graph(html_file, align)
  html_file.puts "<div style='float:" + align + "'>"
  html_file.puts table_title('Taxonomy distribution on annotations')
  canvas_markup = [
    '<table >',
    '<tr>',
    '<td>',
    "<canvas id='taxonomy' width='540' height='640'></canvas>",
    '</td>',
    '</tr>',
    '</table>',
    '</div>'
  ]
  html_file.puts(canvas_markup.join("\n"))
end
519
478
 
520
- uni_500 = 0
521
- uni_200 = 0
522
- nc_total = 0
523
- longest_one = 0
524
479
 
525
- File.open('fln_results/nc_rnas.txt').each do |line|
526
- line.chomp!
527
- (name,fasta_length,acc,db_name,status) = line.split("\t")
480
# Writes the "Database usage" table of the summary page. Every percentage is
# relative to stats_hash['good_seqs'].
#
# html_file  - object handed to write_array_html together with the markup.
# stats_hash - hash of counters; the keys read here are visible below.
# align      - String interpolated into the wrapper's CSS "float:" value.
def database_report(html_file, stats_hash, align)
  html = []
  # The ';' after "margin: 0" is required: without it the float declaration
  # is swallowed into an invalid margin value and the div is not floated.
  html << '<div style=" margin: 0; float:'+align+'">'
  html << table_title('Database usage')
  html.concat(table_header(['', 'Unigenes', '%'], 0))
  html.concat(single_row('UserDB', stats_hash['userdb'], stats_hash['good_seqs']))
  html.concat(single_row('SwissProt', stats_hash['swissprot'], stats_hash['good_seqs']))
  html.concat(single_row('TrEMBL', stats_hash['trembl'], stats_hash['good_seqs']))
  html.concat(single_row('ncRNA', stats_hash['ncrna'], stats_hash['good_seqs']))
  # "None" is reported as the sum of the 'coding' and 'unknown' counters.
  html.concat(single_row('None', stats_hash['coding']+ stats_hash['unknown'], stats_hash['good_seqs']))
  html.concat(single_row('Total', stats_hash['good_seqs'], stats_hash['good_seqs']))
  html << '</table>'
  html << '</div>'
  write_array_html(html, html_file)
end
528
495
 
529
- if (status == 'Putative ncRNA')
496
# Writes the "Report guiding assembly quality" table of the summary page,
# followed by the footnote explaining the <sup>1</sup> marker.
#
# html_file  - object handed to write_array_html together with the markup.
# stats_hash - hash of counters; the keys read here are visible below.
# align      - String interpolated into the wrapper's CSS "float:" value.
def assembly_report(html_file, stats_hash, align)
  html = []
  html << '<div style=" margin: 0; float:'+align+'">'
  html << table_title('Report guiding assembly quality')
  html.concat(table_header(['', 'Unigenes', '%'], 0))
  html.concat(single_row('Unigenes', stats_hash['good_seqs'], stats_hash['good_seqs']))
  html.concat(single_row('Unigenes >500pb', stats_hash['sequences_>500'], stats_hash['good_seqs']))
  html.concat(single_row('Unigenes >200pb', stats_hash['sequences_>200'], stats_hash['good_seqs']))
  html.concat(single_row('Longest unigene', stats_hash['longest_unigene'], nil))
  html.concat(single_row('With orthologue <sup>1</sup>', stats_hash['prot_annotated'], stats_hash['good_seqs']))
  # Indented sub-rows (4th argument) use the parent row's count as their 100%
  # reference. The literal `true` replaces the TRUE constant, which no longer
  # exists in Ruby 3.2+.
  html.concat(single_row('Different orthologue IDs', stats_hash['different_orthologues'], stats_hash['prot_annotated'], true))
  html.concat(single_row('Complete transcripts', stats_hash['complete'], stats_hash['prot_annotated'], true))
  html.concat(single_row('Different complete transcripts ', stats_hash['different_completes'], stats_hash['prot_annotated'], true))
  html.concat(single_row('ncRNA', stats_hash['ncrna'], stats_hash['good_seqs']))
  # "Without orthologue" is reported as the sum of 'coding' and 'unknown'.
  without_orthologue = stats_hash['coding']+ stats_hash['unknown']
  html.concat(single_row('Without orthologue <sup>1</sup>', without_orthologue, stats_hash['good_seqs']))
  html.concat(single_row('Coding (all)', stats_hash['coding'], without_orthologue, true))
  html.concat(single_row('Coding > 200bp', stats_hash['coding_>200'], without_orthologue, true))
  html.concat(single_row('Coding > 500bp', stats_hash['coding_>500'], without_orthologue, true))
  html.concat(single_row('Unknown (all)', stats_hash['unknown'], without_orthologue, true))
  html.concat(single_row('Unknown > 200bp', stats_hash['unknown_>200'], without_orthologue, true))
  html.concat(single_row('Unknown > 500bp', stats_hash['unknown_>500'], without_orthologue, true))
  html << '</table>'
  html << '<sup>1</sup> Percents for subclassifications of this category <br> were calculated using this line as 100% reference.'
  html << '</div>'
  write_array_html(html, html_file)
end
530
523
 
531
- if (fasta_length.to_i > longest_one)
532
- longest_one = fasta_length.to_i
533
- end
534
- # -------------------------------------------------------------------------
535
- if (fasta_length.to_i >= size_filter1)
536
- uni_200 += 1
537
- end
538
- if (fasta_length.to_i >= size_filter2)
539
- uni_500 += 1
540
- end
541
- # -------------------------------------------------------------------------
524
# Writes the floated container for the structural-profile chart: a title
# followed by a one-cell table holding the <canvas id='profile'> element.
#
# html_file - object responding to #puts that receives the HTML.
# align     - String interpolated into the wrapper's CSS "float:" value.
def status_graph(html_file, align)
  html_file.puts "<div style='float:" + align + "'>"
  html_file.puts table_title('Structural profile')
  canvas_markup = [
    '<table >',
    '<tr>',
    '<td>',
    "<canvas id='profile' width='500' height='440'></canvas>",
    '</td>',
    '</tr>',
    '</table>',
    '</div>'
  ]
  html_file.puts(canvas_markup.join("\n"))
end
544
537
 
545
- end
538
# Writes the "Status report" table. Statuses with a sure/putative split are
# rendered through fused_row, the others through composed_single_row; every
# percentage is relative to stats_hash['good_seqs'].
#
# html_file  - object handed to write_array_html together with the markup.
# stats_hash - hash of counters; the keys read here are listed in the layout.
# align      - String interpolated into the wrapper's CSS "float:" value.
def status_report(html_file, stats_hash, align)
  rows = [
    '<div style=" margin: 0; float:'+align+'">',
    table_title('Status report')
  ]
  rows.concat(table_header(['Status', 'Unigenes', '%'], 2))

  reference = stats_hash['good_seqs']
  # One entry per table row: label followed by the counter key(s) it shows.
  # Two keys (sure, putative) mean a fused row, one key a plain row.
  layout = [
    ['Complete', 'complete_sure', 'complete_putative'],
    ['C-terminus', 'c_terminal_sure', 'c_terminal_putative'],
    ['N-terminus', 'n_terminal_sure', 'n_terminal_putative'],
    ['Internal', 'internal'],
    ['Coding', 'coding_sure', 'coding_putative'],
    ['ncRNA', 'ncrna'],
    ['Unknown', 'unknown'],
    ['Total', 'good_seqs']
  ]
  layout.each do |label, *keys|
    counts = keys.map { |key| stats_hash[key] }
    if counts.length == 2
      rows.concat(fused_row(label, counts[0], counts[1], reference))
    else
      rows.concat(composed_single_row(label, counts[0], reference))
    end
  end

  rows << '</table>'
  rows << '</div>'
  write_array_html(rows, html_file)
end
555
+
556
+
557
# Returns the markup for a table heading: the given title in bold inside a
# styled <div>.
#
# title - String; it is concatenated as-is (no escaping is applied).
def table_title(title)
  '<div style="font-size:25px; margin: 10"><b>' + title + '</b></div>'
end
561
+
562
# Returns the opening lines of a table as an Array of Strings: the <table>
# tag and one header row with a <th> per entry of col_array. The table is
# left open; callers append the rows and the closing tag.
#
# col_array - Array of String column titles.
# colspan   - Integer; when greater than 0 it is applied as the colspan of
#             the first header cell only.
def table_header(col_array, colspan)
  header_cells = col_array.each_with_index.map do |title, position|
    if position == 0 && colspan > 0
      '<th colspan="'+colspan.to_s+'">'+title+'</th>'
    else
      '<th>'+title+'</th>'
    end
  end

  ['<table border="2" cellspacing="0" cellpadding="2">', '<tr>'] + header_cells + ['</tr>']
end
547
578
 
548
- return [nc_total, uni_500, uni_200, longest_one]
579
# Returns one table row as an Array of Strings: a left-aligned label cell
# followed by the cells produced by sub_row(magnitude, total).
#
# name      - String label of the row.
# magnitude - value passed to sub_row.
# total     - reference value passed to sub_row.
# space     - when truthy the label is indented with five non-breaking
#             spaces. The default is the literal `false`: the FALSE constant
#             used before was removed in Ruby 3.2, which made every call
#             relying on the default raise NameError.
def single_row(name, magnitude, total, space = false)
  # Build a new string so the caller's label is not modified.
  name = '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'+ name if space
  html = []
  html << '<tr>'
  html << '<td align="left">'+name+'</td>'
  html.concat(sub_row(magnitude, total))
  html << '</tr>'
  html
end
550
590
 
551
- def chimera_stats(size_filter1,size_filter2)
552
-
553
- uni_500 = 0
554
- uni_200 = 0
555
- ch_total = 0
556
- longest_one = 0
557
- db_usage = [0,0,0]
558
-
559
- if !File.exists?('fln_results/chimeric_sequences.txt')
560
- return [0, 0, 0, longest_one, db_usage]
591
+
592
# Returns two table rows as an Array of Strings for a status that is split
# into "Sure" and "Putative" counts. The type label is emitted once, in a
# cell spanning both rows.
#
# type               - String label shown in the rowspan cell.
# sure_magnitude     - value for the "Sure" row, passed to sub_row.
# putative_magnitude - value for the "Putative" row, passed to sub_row.
# total              - reference value passed to sub_row for both rows.
def fused_row(type, sure_magnitude, putative_magnitude, total)
  html = []
  # Open the first row explicitly; it used to be closed without ever being
  # opened, which produced malformed table markup.
  html << '<tr>'
  html << '<td rowspan="2" align="left">'+type+'</td>'
  html << seq_status('Sure')
  html.concat(sub_row(sure_magnitude, total))
  html << '</tr>'
  html << '<tr>'
  html << seq_status('Putative')
  html.concat(sub_row(putative_magnitude, total))
  html << '</tr>'
  html
end
604
+
605
# Returns a left-aligned table cell containing the given status label.
#
# status - String; it is concatenated as-is (no escaping is applied).
def seq_status(status)
  '<td align="left">' + status + '</td>'
end
609
+
610
# Returns the two right-aligned cells shared by every report row: the raw
# magnitude and its percentage of total with two decimals.
#
# magnitude - value shown in the first cell (via #to_s). It must be numeric
#             whenever total is not nil, because it is multiplied by 100.0.
# total     - reference value for the percentage, or nil when no percentage
#             applies.
#
# The percentage cell shows '-' when total is nil or when the ratio is not a
# finite number. That covers 0/0 (NaN) and also a non-zero magnitude over a
# zero total (Infinity), which used to be rendered as "Inf%".
def sub_row(magnitude, total)
  percentage = '-'
  unless total.nil?
    ratio = magnitude * 100.0 / total
    percentage = format('%.2f', ratio) + '%' if ratio.finite?
  end

  [
    '<td align="right">' + magnitude.to_s + '</td>',
    '<td align="right">' + percentage + '</td>'
  ]
end
627
+
628
# Returns one table row as an Array of Strings whose label cell spans two
# columns, followed by the cells produced by sub_row(magnitude, total).
#
# type      - String label of the row.
# magnitude - value passed to sub_row.
# total     - reference value passed to sub_row.
def composed_single_row(type, magnitude, total)
  row = ['<tr>', '<td colspan="2" align="left">'+type+'</td>']
  row.concat(sub_row(magnitude, total))
  row.push('</tr>')
  row
end
598
636
 
599
637
 
638
# Writes every element of html to html_file with #puts, one call per element,
# and returns the collected results of those calls.
#
# html      - Array of lines to write.
# html_file - object responding to #puts.
def write_array_html(html, html_file)
  html.map do |markup_line|
    html_file.puts(markup_line)
  end
end
600
641
  end