full_lengther_next 0.0.8 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -5,7 +5,7 @@ class String
5
5
  s = self.upcase
6
6
  a = s.split('').each_slice(3).map{|e| e.join}
7
7
 
8
- c={'GCT'=>'A', 'GCC'=>'A','GCA'=>'A','GCG'=>'A',
8
+ c={ 'GCT'=>'A','GCC'=>'A','GCA'=>'A','GCG'=>'A',
9
9
  'CGT'=>'R','CGC'=>'R','CGA'=>'R','CGG'=>'R','AGA'=>'R','AGG'=>'R',
10
10
  'AAT'=>'N','AAC'=>'N',
11
11
  'GAT'=>'D','GAC'=>'D',
@@ -37,6 +37,8 @@ class String
37
37
  else
38
38
  c[e]||'x'
39
39
  end
40
+ else
41
+ 'x'
40
42
  end
41
43
  }
42
44
  return res.compact.join
@@ -136,4 +138,4 @@ class String
136
138
  return self.reverse.split('').map{|e| c[e]}.join
137
139
  end
138
140
 
139
- end
141
+ end
@@ -1,600 +1,641 @@
1
+ require 'types.rb'
1
2
 
2
3
  module FlnStats
3
-
4
- def summary_stats
5
- stats_file = File.open('fln_results/summary_stats.html', 'w')
6
-
7
- size_filter1 = 200
8
- size_filter2 = 500
9
-
10
- # recogemos los trozos de html fijos
11
- (html_head, html_st, html_uni, html_db, html_as, html_end) = html_code
12
-
13
- total_seqs = 0
14
- status_suma = 0
15
- #recogemos los datos que necesitamos de los ficheros de resultados
16
- (status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats(size_filter1,size_filter2)
17
- (tcode_array, seqs_number2, tc_uni_500, tc_uni_200, tc_longest_one) = testcode_stats(size_filter1,size_filter2)
18
- (ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats(size_filter1,size_filter2)
19
- (chimera_total, ch_uni_500, ch_uni_200, ch_longest_one, ch_db_usage)=chimera_stats(size_filter1,size_filter2)
20
-
21
- seqs_number1 = (seqs_number1+chimera_total.to_i)
22
- total_seqs = (seqs_number1 + seqs_number2 + ncrna_total.to_i)
23
- uni_500 = (db_uni_500 + tc_uni_500 + nc_uni_500 + ch_uni_500)
24
- uni_200 = (db_uni_200 + tc_uni_200 + nc_uni_200 + ch_uni_200)
25
- longest_one = [db_longest_one, tc_longest_one, nc_longest_one, ch_longest_one].max
26
- db_usage[0] += ch_db_usage[0]
27
- db_usage[1] += ch_db_usage[1]
28
- db_usage[2] += ch_db_usage[2]
29
- stats_file.puts html_head
30
-
31
- if (total_seqs.to_i > 0)
32
- # imprimimos la tabla Status Report --------------------------------------------------------------------------------------------
33
- stats_file.puts html_st
34
- status_array.each do |status|
35
- if (status[1] == 'Internal') || (status[1] == 'Misassembled')
36
- stats_file.puts ' <tr>
37
- <td colspan="2" align="left">'+status[1].to_s+'</td>
38
- <td align="right">'+status[0].to_s+'</td>
39
- <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
40
- </tr>'
41
- elsif (status[1] =~ /^Putative/)
42
- stats_file.puts ' <tr>
43
- <td align="left">Putative</td>
44
- <td align="right">'+status[0].to_s+'</td>
45
- <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
46
- </tr>'
47
- else
48
- stats_file.puts ' <tr>
49
- <td rowspan="2" align="left">'+status[1].to_s+'</td>
50
- <td align="left">Sure</td>
51
- <td align="right">'+status[0].to_s+'</td>
52
- <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
53
- </tr>'
54
- end
55
- status_suma += status[0]
56
- end
57
-
58
- # adding chimeric seqs
59
- stats_file.puts ' <tr>
60
- <td colspan="2" align="left">Putative chimera</td>
61
- <td align="right">'+chimera_total.to_s+'</td>
62
- <td align="right">'+'%.2f' % (100*chimera_total.to_f/total_seqs.to_f).to_s+' %</td>
63
- </tr>'
64
- status_suma += chimera_total
65
-
66
- # añadimos los coding, P.coding
67
- tcode_array.each do |status|
68
- if (status[1] == 'Coding')
69
- stats_file.puts ' <tr>
70
- <td rowspan="2" align="left">'+status[1].to_s+'</td>
71
- <td align="left">Sure</td>
72
- <td align="right">'+status[0].to_s+'</td>
73
- <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
74
- </tr>'
75
- elsif (status[1] == 'Putative Coding')
76
- stats_file.puts ' <tr>
77
- <td align="left">Putative</td>
78
- <td align="right">'+status[0].to_s+'</td>
79
- <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
80
- </tr>'
4
# Builds the accumulator used for the full-length analysis summary.
#
# Every counter starts at zero. Ruby hashes keep insertion order, so the
# order of the names below is the order seen by anything iterating the hash.
#
# @return [Hash{String => Integer}] a fresh hash of zeroed counters
def initialize_stats_hash
  counter_names = [
    'input_seqs',
    'output_seqs',
    'failed',
    'sequences_>200',
    'sequences_>500',
    'longest_unigene',
    'good_seqs',
    'artifacts',
    'misassembled',
    'chimeras',
    'other_artifacts',
    'unknown',
    'unknown_>200',
    'unknown_>500',
    'prot_annotated',
    'complete',
    'complete_sure',
    'complete_putative',
    'n_terminal',
    'n_terminal_sure',
    'n_terminal_putative',
    'c_terminal',
    'c_terminal_sure',
    'c_terminal_putative',
    'internal',
    'swissprot',
    'trembl',
    'userdb',
    'ncrna',
    'coding',
    'coding_sure',
    'coding_putative',
    'coding_>200',
    'coding_>500',
    'different_orthologues',
    'different_completes',
    'BA_index'
  ]

  counter_names.each_with_object({}) { |counter, stats| stats[counter] = 0 }
end
46
+
47
# Extracts the organism named in a database hit description and adds one to
# its entry in +taxonomy+.
#
# Three description layouts are tried, in this order:
# * the description contains 'OS=': the text between 'OS=' and ' GN=' is used;
# * the description starts with 'sp=' or 'tr=': the two words that precede
#   two (preferred) or one parenthesised group are used;
# * anything else: the text after the first ';', cut at the first '.', with
#   non-numeric parenthesised parts removed.
#
# Only the first two whitespace-separated words of the result are kept as the
# tally key. Descriptions that yield no organism (or an empty one) leave
# +taxonomy+ untouched.
#
# @param name [String] hit description; it is not modified
# @param taxonomy [Hash{String => Integer}] running tally, updated in place
# @return [Integer, nil] the updated count for the organism, or nil when no
#   organism could be extracted
def get_taxonomy(name, taxonomy)
  organism = nil
  if name.include?('OS=')
    organism = name.split('OS=', 2).last.to_s.split(' GN=').first.to_s.strip
  elsif name.start_with?('sp=', 'tr=')
    # This used to be `name[0..2] = 'sp=' || ...`, an assignment that was
    # always truthy, rewrote the caller's string and made the branch below
    # unreachable. A real prefix comparison is what the condition needs.
    match = name.match(/(\w+ \w+) \(([\w ]+)\) \(([\w ]+)\)/) ||
            name.match(/(\w+ \w+) \(([\w ]+)\)/)
    organism = match[1] if match
  else
    organism = name.split(';', 2).last.to_s.split('.', 2).first.to_s
    organism = organism.gsub(/\(\D+\)/, '')
    if organism.split(' ').length > 1
      # NOTE(review): the last substitution removes every space, which merges
      # multi-word names into one token. It may originally have collapsed
      # double spaces instead - confirm against the packaged file.
      organism = organism.gsub('.', '').gsub(/^ /, '').gsub(' ', '').strip
    end
  end
  return nil if organism.nil?

  organism = organism.split(' ')[0..1].join(' ')
  return nil if organism.empty?

  taxonomy[organism] = (taxonomy[organism] || 0) + 1
end
141
82
 
142
- # imprimimos la tabla Database Usage --------------------------------------------------------------------------------------------
143
- stats_file.puts html_db
144
- db_names=["UserDB", "SwissProt", "TrEMBL"]
145
- total_db = 0
146
-
147
- for i in 0..db_usage.length-1 do i
148
- total_db += db_usage[i]
149
- stats_file.puts ' <tr>
150
- <td align="left">'+db_names[i].to_s+'</td>
151
- <td align="right">'+db_usage[i].to_s+'</td>
152
- <td align="right">'+'%.2f' % (100*db_usage[i].to_f/total_seqs.to_f).to_s+' %</td>
153
- </tr>'
154
- end
155
- no_db = seqs_number2 + ncrna_total.to_i
156
- stats_file.puts ' <tr>
157
- <td align="left">None</td>
158
- <td align="right">'+no_db.to_s+'</td>
159
- <td align="right">'+'%.2f' % (100*no_db.to_f/total_seqs.to_f).to_s+' %</td>
160
- </tr>'
161
- total_db += no_db
162
- stats_file.puts ' <tr>
163
- <td align="left">Total</td>
164
- <td align="right">'+total_db.to_s+'</td>
165
- <td align="right">'+'%.2f' % (100*total_db.to_f/total_seqs.to_f).to_s+' %</td>
166
- </tr>
167
- </table>'
168
-
169
- # imprimimos la tabla Report guiding assembly quality -------------------------------------------------------------
170
- stats_file.puts html_as
171
- stats_file.puts ' <tr>
172
- <td align="left">Unigenes</td>
173
- <td align="right">'+total_seqs.to_s+'</td>
174
- <td align="right">'+'%.2f' % (100*total_seqs.to_f/total_seqs.to_f).to_s+' %</td>
175
- </tr>'
176
- stats_file.puts ' <tr>
177
- <td align="left">Unigenes >'+size_filter2.to_s+'pb</td>
178
- <td align="right">'+uni_500.to_s+'</td>
179
- <td align="right">'+'%.2f' % (100*uni_500.to_f/total_seqs.to_f).to_s+' %</td>
180
- </tr>'
181
- stats_file.puts ' <tr>
182
- <td align="left">Unigenes >'+size_filter1.to_s+'pb</td>
183
- <td align="right">'+uni_200.to_s+'</td>
184
- <td align="right">'+'%.2f' % (100*uni_200.to_f/total_seqs.to_f).to_s+' %</td>
185
- </tr>'
186
- stats_file.puts ' <tr>
187
- <td align="left">Longest unigene</td>
188
- <td align="right">'+longest_one.to_s+'</td>
189
- <td align="right">-</td>
190
- </tr>'
191
- stats_file.puts ' <tr>
192
- <td align="left">With orthologue <sup>1</sup></td>
193
- <td align="right">'+seqs_number1.to_s+'</td>
194
- <td align="right">'+'%.2f' % (100*seqs_number1.to_f/total_seqs.to_f).to_s+' %</td>
195
- </tr>'
196
-
197
- if (seqs_number1.to_i > 0)
198
- stats_file.puts ' <tr>
199
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Different orthologue IDs</td>
200
- <td align="right">'+seq_uniq.to_s+'</td>
201
- <td align="right">'+'%.2f' % (100*seq_uniq.to_f/seqs_number1.to_f).to_s+' %</td>
202
- </tr>'
203
- stats_file.puts ' <tr>
204
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Complete transcripts</td>
205
- <td align="right">'+status_array[0][0].to_s+'</td>
206
- <td align="right">'+'%.2f' % (100*status_array[0][0].to_f/seqs_number1.to_f).to_s+' %</td>
207
- </tr>'
208
- stats_file.puts ' <tr>
209
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Different complete transcripts</td>
210
- <td align="right">'+complete_uniq.to_s+'</td>
211
- <td align="right">'+'%.2f' % (100*complete_uniq.to_f/seqs_number1.to_f).to_s+' %</td>
212
- </tr>'
213
- stats_file.puts ' <tr>
214
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Misassembled</td>
215
- <td align="right">'+error_1_num.to_s+'</td>
216
- <td align="right">'+'%.2f' % (100*error_1_num.to_f/seqs_number1.to_f).to_s+' %</td>
217
- </tr>'
218
- stats_file.puts ' <tr>
219
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative chimera</td>
220
- <td align="right">'+chimera_total.to_s+'</td>
221
- <td align="right">'+'%.2f' % (100*chimera_total.to_f/seqs_number1.to_f).to_s+' %</td>
222
- </tr>'
223
- end
224
- stats_file.puts ' <tr>
225
- <td align="left">Without orthologue <sup>1</sup></td>
226
- <td align="right">'+no_db.to_s+'</td>
227
- <td align="right">'+'%.2f' % (100*seqs_number2.to_f/total_seqs.to_f).to_s+' %</td>
228
- </tr>'
229
-
230
- if (no_db.to_i > 0) && (seqs_number2.to_i > 0)
231
- stats_file.puts ' <tr>
232
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding (all)</td>
233
- <td align="right">'+tcode_array[0][0].to_s+'</td>
234
- <td align="right">'+'%.2f' % (100*tcode_array[0][0].to_f/no_db.to_f).to_s+' %</td>
235
- </tr>'
236
- stats_file.puts ' <tr>
237
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding > '+size_filter1.to_s+'bp</td>
238
- <td align="right">'+tcode_array[0][2].to_s+'</td>
239
- <td align="right">'+'%.2f' % (100*tcode_array[0][2].to_f/no_db.to_f).to_s+' %</td>
240
- </tr>'
241
- stats_file.puts ' <tr>
242
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding > '+size_filter2.to_s+'bp</td>
243
- <td align="right">'+tcode_array[0][3].to_s+'</td>
244
- <td align="right">'+'%.2f' % (100*tcode_array[0][3].to_f/no_db.to_f).to_s+' %</td>
245
- </tr>'
246
- stats_file.puts ' <tr>
247
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding (all)</td>
248
- <td align="right">'+tcode_array[1][0].to_s+'</td>
249
- <td align="right">'+'%.2f' % (100*tcode_array[1][0].to_f/no_db.to_f).to_s+' %</td>
250
- </tr>'
251
- stats_file.puts ' <tr>
252
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding > '+size_filter1.to_s+'bp</td>
253
- <td align="right">'+tcode_array[1][2].to_s+'</td>
254
- <td align="right">'+'%.2f' % (100*tcode_array[1][2].to_f/no_db.to_f).to_s+' %</td>
255
- </tr>'
256
- stats_file.puts ' <tr>
257
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding > '+size_filter2.to_s+'bp</td>
258
- <td align="right">'+tcode_array[1][3].to_s+'</td>
259
- <td align="right">'+'%.2f' % (100*tcode_array[1][3].to_f/no_db.to_f).to_s+' %</td>
260
- </tr>'
261
- stats_file.puts ' <tr>
262
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative ncRNA</td>
263
- <td align="right">'+ncrna_total.to_s+'</td>
264
- <td align="right">'+'%.2f' % (100*ncrna_total.to_f/no_db.to_f).to_s+' %</td>
265
- </tr>'
266
- stats_file.puts ' <tr>
267
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown (all)</td>
268
- <td align="right">'+tcode_array[2][0].to_s+'</td>
269
- <td align="right">'+'%.2f' % (100*tcode_array[2][0].to_f/no_db.to_f).to_s+' %</td>
270
- </tr>'
271
- stats_file.puts ' <tr>
272
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown > '+size_filter1.to_s+'bp</td>
273
- <td align="right">'+tcode_array[2][2].to_s+'</td>
274
- <td align="right">'+'%.2f' % (100*tcode_array[2][2].to_f/no_db.to_f).to_s+' %</td>
275
- </tr>'
276
- stats_file.puts ' <tr>
277
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown > '+size_filter2.to_s+'bp</td>
278
- <td align="right">'+tcode_array[2][3].to_s+'</td>
279
- <td align="right">'+'%.2f' % (100*tcode_array[2][3].to_f/no_db.to_f).to_s+' %</td>
280
- </tr>'
281
- end
282
- stats_file.puts ' </table>
283
- <sup>1</sup> Percents for subclassifications of this category were calculated using this line as 100% reference.'
284
-
83
# Builds the zeroed counter hash used for the reptrans summary.
#
# @return [Hash{String => Integer}] a fresh hash with every counter at zero,
#   in the key order listed here
def initialize_stats_hash_reptrans
  {
    'prot_annotated' => 0,
    'est_annotated' => 0,
    'coding_>1' => 0,
    'coding_>0.94' => 0,
    'coding_>0.84' => 0,
    'coding_>0.73' => 0,
    'coding_>0' => 0
  }
end
94
+
95
# Adds the statistics of one batch of sequences to the running totals.
#
# Sequences are classified by comparing +s.type+ with the constants FAILED,
# OTHER, MISASSEMBLED, CHIMERA, UNKNOWN, COMPLETE, N_TERMINAL, C_TERMINAL,
# INTERNAL, NCRNA and CODING, which are defined outside this method
# (presumably in the required 'types.rb' - confirm).
#
# @param seqs [Array] sequence objects responding to +type+, +fasta_length+,
#   +seq_name+, +status+, +db_name+ and +hit+ (whose +acc+ is read for
#   sequences with UNKNOWN < type < NCRNA and for COMPLETE sequences)
# @param stats_hash [Hash{String => Numeric}] running counters, updated in place
# @param diff_ids_array [Array] accumulated hit accessions, updated in place
#   and kept free of duplicates
# @param diff_ids_complete_array [Array] accumulated accessions of complete
#   sequences, updated in place and kept free of duplicates
# @return [Array] +[stats_hash, diff_ids_array, diff_ids_complete_array]+
def summary_stats(seqs, stats_hash, diff_ids_array, diff_ids_complete_array)
  low_limit = 200
  upper_limit = 500

  # All seqs
  #-----------
  stats_hash['output_seqs'] += seqs.length
  good_seqs = seqs.select { |s| s.type >= UNKNOWN }
  stats_hash['good_seqs'] += good_seqs.length

  # Longest unigene. +max+ is nil for an empty batch, so guard before comparing.
  current_longest_unigene = seqs.map { |s| s.fasta_length }.max
  if current_longest_unigene && current_longest_unigene > stats_hash['longest_unigene']
    stats_hash['longest_unigene'] = current_longest_unigene
  end

  # Load ids
  seqs.each do |s|
    diff_ids_array << s.hit.acc if s.type > UNKNOWN && s.type < NCRNA
  end
  diff_ids_array.uniq!

  # By length
  stats_hash['sequences_>200'] += good_seqs.count { |s| s.fasta_length > low_limit }
  stats_hash['sequences_>500'] += good_seqs.count { |s| s.fasta_length > upper_limit }

  stats_hash['failed'] += seqs.count { |s| s.type == FAILED }

  # Unknown
  #-----------------------------
  all_unknown = seqs.select { |s| s.type == UNKNOWN }
  stats_hash['unknown'] += all_unknown.length
  stats_hash['unknown_>200'] += all_unknown.count { |s| s.fasta_length > low_limit }
  stats_hash['unknown_>500'] += all_unknown.count { |s| s.fasta_length > upper_limit }

  # Artifacts
  #----------------
  stats_hash['artifacts'] += seqs.count { |s| s.type < UNKNOWN && s.type > FAILED }
  stats_hash['misassembled'] += seqs.count { |s| s.type == MISASSEMBLED }
  # Pieces of an already split chimera ('_split_' in the name) are not counted
  # again, so a multiple chimera is reported once.
  stats_hash['chimeras'] += seqs.count { |s| s.type == CHIMERA && !s.seq_name.include?('_split_') }
  stats_hash['other_artifacts'] += seqs.count { |s| s.type == OTHER }

  # Annotated with prot
  #---------------------
  prot_annotated = seqs.select { |s| s.type >= COMPLETE && s.type <= INTERNAL }
  stats_hash['prot_annotated'] += prot_annotated.length

  # By annotation
  stats_hash['internal'] += seqs.count { |s| s.type == INTERNAL }
  complete = seqs.select { |s| s.type == COMPLETE }
  n_terminal = seqs.select { |s| s.type == N_TERMINAL }
  c_terminal = seqs.select { |s| s.type == C_TERMINAL }

  stats_hash['complete'] += complete.length
  stats_hash['n_terminal'] += n_terminal.length
  stats_hash['c_terminal'] += c_terminal.length

  # Load complete ids
  complete.each { |s| diff_ids_complete_array << s.hit.acc }
  diff_ids_complete_array.uniq!

  # ----> By status (a truthy status counts as "sure", otherwise "putative")
  stats_hash['complete_sure'] += complete.count { |s| s.status }
  stats_hash['n_terminal_sure'] += n_terminal.count { |s| s.status }
  stats_hash['c_terminal_sure'] += c_terminal.count { |s| s.status }
  stats_hash['complete_putative'] += complete.count { |s| !s.status }
  stats_hash['n_terminal_putative'] += n_terminal.count { |s| !s.status }
  stats_hash['c_terminal_putative'] += c_terminal.count { |s| !s.status }

  # By database: anything that is neither 'sp_*' nor 'tr_*' counts as user db
  swissprot = prot_annotated.count { |s| s.db_name =~ /^sp_/ }
  trembl = prot_annotated.count { |s| s.db_name =~ /^tr_/ }
  stats_hash['swissprot'] += swissprot
  stats_hash['trembl'] += trembl
  stats_hash['userdb'] += prot_annotated.length - swissprot - trembl

  # ncRNA
  #----------------
  stats_hash['ncrna'] += seqs.count { |s| s.type == NCRNA }

  # Coding sequences
  #----------------
  coding = seqs.select { |s| s.type == CODING }
  stats_hash['coding'] += coding.length
  stats_hash['coding_sure'] += coding.count { |s| s.status }
  stats_hash['coding_putative'] += coding.count { |s| !s.status }
  stats_hash['coding_>200'] += coding.count { |s| s.fasta_length > low_limit }
  stats_hash['coding_>500'] += coding.count { |s| s.fasta_length > upper_limit }

  return stats_hash, diff_ids_array, diff_ids_complete_array
end
290
193
 
194
# Fills in the statistics that can only be computed once every batch has been
# accumulated: the number of distinct orthologue ids, the number of distinct
# complete ids and the 'BA_index'.
#
# 'BA_index' is only written when all five counters it depends on are
# positive; otherwise the value already stored in the hash is left as is.
#
# @param stats_hash [Hash{String => Numeric}] accumulated counters, updated in place
# @param diff_ids_array [Array] distinct hit accessions
# @param diff_ids_complete_array [Array] distinct accessions of complete sequences
# @return [Hash] the same +stats_hash+
def last_stats(stats_hash, diff_ids_array, diff_ids_complete_array)
  stats_hash['different_orthologues'] = diff_ids_array.length
  stats_hash['different_completes'] = diff_ids_complete_array.length

  # BA index
  needed = ['prot_annotated', 'complete', 'sequences_>500', 'different_orthologues', 'different_completes']
  return stats_hash unless needed.all? { |counter| stats_hash[counter] > 0 }

  annotated = stats_hash['prot_annotated']
  completes = stats_hash['complete']
  annotation_coef = (annotated * completes * 1.0) / (stats_hash['sequences_>500'] * 10000)
  improvement_coef = (stats_hash['different_orthologues'] * 1.0 + stats_hash['different_completes']) / (annotated + completes)
  stats_hash['BA_index'] = Math.sqrt(annotation_coef * improvement_coef)

  stats_hash
end
291
210
 
211
# Adds one to the reptrans coding counter that matches the sequence's
# +t_code+ score.
#
# The score is compared against the thresholds below from highest to lowest
# and the first threshold it exceeds selects the counter. Scores of zero or
# less change nothing.
#
# @param coding_seq [#t_code] sequence whose score is classified
# @param stats_hash [Hash{String => Integer}] counters, updated in place
# @return [Integer, nil] the new value of the selected counter, or nil when
#   no counter was selected
def coding_stats_reptrans(coding_seq, stats_hash)
  bins = [
    [1, 'coding_>1'],
    [0.95, 'coding_>0.94'],
    [0.85, 'coding_>0.84'],
    [0.73, 'coding_>0.73'],
    [0, 'coding_>0']
  ]
  _threshold, group = bins.find { |limit, _label| coding_seq.t_code > limit }
  stats_hash[group] += 1 unless group.nil?
end
292
228
 
229
# Finalises the accumulated statistics and writes both report formats.
#
# The hash returned by +last_stats+ is what gets handed to +write_txt+ and
# +write_html+.
#
# @param stats_hash [Hash] accumulated counters
# @param stats_taxonomy [Hash] organism tally passed on to +write_html+
# @param diff_ids_array [Array] distinct hit accessions
# @param diff_ids_complete_array [Array] distinct accessions of complete sequences
# @param txt_file [#puts] destination handed to +write_txt+
# @param html_file [Object] destination handed to +write_html+
# @return whatever +write_html+ returns
def write_summary_stats(stats_hash, stats_taxonomy, diff_ids_array, diff_ids_complete_array, txt_file, html_file)
  final_stats = last_stats(stats_hash, diff_ids_array, diff_ids_complete_array)
  write_txt(final_stats, txt_file)
  write_html(final_stats, html_file, stats_taxonomy)
end
293
234
 
294
- def html_code
295
- html_head = '<html>
296
- <head>
297
- <title>FLN Summary</title>
298
- </head>
299
-
300
- <body bgcolor="#FFFFFF">
301
- <center>
302
- <h1 align="center">
303
- Full-LengtherNEXT Summary
304
- </h1>'
305
-
306
-
307
- html_1 = '
308
- <h2 align="center">
309
- Status report
310
- </h2>
311
-
312
- <table border="2" cellspacing="0" cellpadding="2">
313
- <tr>
314
- <th colspan="2">Status</th>
315
- <th>Unigenes</th>
316
- <th>%</th>
317
- </tr>'
318
-
319
- html_2= '
320
- <h2 align="center">
321
- Unigene report
322
- </h2>
323
-
324
- <table border="2" cellspacing="0" cellpadding="2">
325
- <tr>
326
- <th></th>
327
- <th>Unigenes</th>
328
- <th>%</th>
329
- </tr>'
330
-
331
- html_3= '
332
- <h2 align="center">
333
- Database usage
334
- </h2>
335
-
336
- <table border="2" cellspacing="0" cellpadding="2">
337
- <tr>
338
- <th></th>
339
- <th>Unigenes</th>
340
- <th>%</th>
341
- </tr>'
342
-
343
- html_4= '
344
- <h2 align="center">
345
- Report guiding assembly quality
346
- </h2>
347
-
348
- <table border="2" cellspacing="0" cellpadding="2">
349
- <tr>
350
- <th></th>
351
- <th>Unigenes</th>
352
- <th>%</th>
353
- </tr>'
354
-
355
- html_5 = ' </body>
356
- </html>'
357
-
358
- return [html_head, html_1, html_2, html_3, html_4, html_5]
235
# Writes the reptrans statistics as a text report and as an HTML report.
#
# Both files are opened for writing (truncating existing content) using the
# block form of File.open, so they are flushed and closed when the reports
# have been written, including when a writer raises.
#
# @param stats_hash [Hash] statistics to report
# @param html_file [String] path of the HTML report
# @param txt_file [String] path of the text report
# @return whatever +write_html_reptrans+ returns
def write_reptrans_stats(stats_hash, html_file, txt_file)
  File.open(html_file, 'w') do |html|
    File.open(txt_file, 'w') do |txt|
      write_txt(stats_hash, txt)
      write_html_reptrans(stats_hash, html)
    end
  end
end
241
+
242
# Emits the complete reptrans HTML document: the opening tag, the head
# produced by +header+, the body produced by +body_reptrans+ and the
# closing tag.
#
# @param stats_hash [Hash] statistics handed to +body_reptrans+
# @param html_file [#puts] open destination for the document
# @return [nil] the result of the final +puts+ on a regular IO
def write_html_reptrans(stats_hash, html_file)
  html_file.puts('<html>')
  header(html_file)
  body_reptrans(html_file, stats_hash)
  html_file.puts('</html>')
end
359
248
 
249
# Writes one line per statistic in the form "<value><TAB><name>", following
# the iteration order of the hash.
#
# @param stats_hash [Hash] statistics to dump
# @param file [#puts] open destination
# @return [Hash] +stats_hash+ itself
def write_txt(stats_hash, file)
  stats_hash.each_pair do |stat_name, stat_value|
    file.puts(stat_value.to_s + "\t" + stat_name.to_s)
  end
end
361
254
 
255
# Writes the HTML summary report.
#
# The report loads scripts from an 'expresscanvas' directory next to the
# report file. When that directory is missing, 'expresscanvas.zip' (located
# two directories above this source file) is extracted there with the
# external +unzip+ command before the document is written.
#
# @param stats_hash [Hash] statistics handed to +html_header+ and +body+
# @param html_file [File] open report file; +to_path+ is used to locate its
#   directory
# @param stats_taxonomy [Hash] organism tally handed to +html_header+
# @return [nil] the result of the final +puts+
def write_html(stats_hash, html_file, stats_taxonomy)
  js_path = File.dirname(html_file.to_path)
  # File.exist? is used because File.exists? no longer exists in current Ruby.
  unless File.exist?(File.join(js_path, 'expresscanvas'))
    zip_path = File.join(File.dirname(__FILE__), '..', '..', 'expresscanvas.zip')
    # Separate arguments bypass the shell, so paths with spaces or shell
    # metacharacters are passed to unzip unchanged.
    system('unzip', '-qq', zip_path, '-d', js_path)
  end
  html_file.puts '<html>'
  html_header(html_file, stats_hash, stats_taxonomy)
  body(html_file, stats_hash)
  html_file.puts '</html>'
end
362
263
 
363
- def annotation_stats(size_filter1,size_filter2)
364
-
365
- seqs_number = 0
366
- array_of_all_accs = []
367
- array_of_complete_accs = []
368
- error_1_num = 0
369
- uni_500 = 0
370
- uni_200 = 0
371
- longest_one = 0
372
-
373
- status_array = []
374
- # total, status
375
- complete = [0,'Complete']
376
- putative_complete = [0,'Putative Complete']
377
- c_terminus = [0,'C-terminus']
378
- putative_c_terminus = [0,'Putative C-terminus']
379
- n_terminus = [0,'N-terminus']
380
- putative_n_terminus = [0,'Putative N-terminus']
381
- internal = [0,'Internal']
382
- cod_seq = [0,'Misassembled']
383
-
384
- #userdb, SwissProt, TrEMBL
385
- db_usage = [0,0,0]
386
-
387
- File.open('fln_results/dbannotated.txt').each do |line|
388
- line.chomp!
389
- (name,fasta_length,acc,db_name,status,kk1,kk2,kk3,kk4,kk5,msgs) = line.split("\t")
390
-
391
- if (line !~ /^Query_id\t/) && (!line.empty?)
392
- seqs_number += 1
393
- if (fasta_length.to_i > longest_one)
394
- longest_one = fasta_length.to_i
395
- end
396
- array_of_all_accs.push acc
397
-
398
- if (db_name !~ /^sp_/) && (db_name !~ /^tr_/)
399
- db_usage[0] += 1
400
- elsif (db_name =~ /^sp_/)
401
- db_usage[1] += 1
402
- elsif (db_name =~ /^tr_/)
403
- db_usage[2] += 1
404
- end
405
-
406
- # -------------------------------------------------------------------------
407
- if (fasta_length.to_i >= size_filter1)
408
- uni_200 += 1
409
- end
410
- if (fasta_length.to_i >= size_filter2)
411
- uni_500 += 1
412
- end
413
- # -------------------------------------------------------------------------
414
- if (msgs =~ /ERROR#1/)
415
- error_1_num += 1
416
- end
417
- # -------------------------------------------------------------------------
418
- if (status == 'Complete')
419
- complete[0] += 1
420
- array_of_complete_accs.push acc
421
-
422
- elsif (status == 'Putative Complete')
423
- putative_complete[0] += 1
424
- elsif (status == 'C-terminus')
425
- c_terminus[0] += 1
426
- elsif (status == 'N-terminus')
427
- n_terminus[0] += 1
428
- elsif (status == 'Putative C-terminus')
429
- putative_c_terminus[0] += 1
430
- elsif (status == 'Putative N-terminus')
431
- putative_n_terminus[0] += 1
432
- elsif (status == 'Internal')
433
- internal[0] += 1
434
- elsif (status == 'Misassembled')
435
- cod_seq[0] += 1
436
- end
437
- # -------------------------------------------------------------------------
438
- end
264
# Writes the minimal <head> section (title only), one element per line.
#
# @param html_file [#puts] open destination
# @return [nil] the result of +puts+ on a regular IO
def header(html_file)
  head_lines = ['<head>', '<title>FLN Summary</title>', '</head>']
  html_file.puts(*head_lines)
end
439
269
 
440
- end
270
# Writes the <head> section of the summary report, including the script that
# draws two CanvasXpress charts:
# * 'profile': stacked percentages (relative to 'good_seqs') of each
#   structural class, split into a "Sure" and a "Putative" series;
# * 'taxonomy': the 21 most frequent organisms with their counts.
#
# Changes with respect to the previous implementation:
# * the "Sure" series uses 'coding_sure' for the coding class. It used the
#   total 'coding' count, so putative coding sequences were stacked twice;
# * organism names are escaped before being placed in single-quoted
#   JavaScript strings;
# * an empty taxonomy produces an empty label list instead of [''];
# * percentages are 0.0 (not NaN) when there are no good sequences.
#
# NOTE(review): the leading whitespace inside the emitted HTML/JavaScript is
# not significant to browsers; confirm it if the exact bytes matter.
#
# @param html_file [#puts] open destination
# @param stats_hash [Hash{String => Numeric}] accumulated statistics
# @param stats_taxonomy [Hash{String => Integer}] organism tally
# @return [nil] the result of the final +puts+
def html_header(html_file, stats_hash, stats_taxonomy)
  sure_keys = ['unknown', 'complete_sure', 'n_terminal_sure', 'c_terminal_sure', 'internal', 'ncrna', 'coding_sure']
  # nil marks classes that have no putative variant; they are plotted as zero.
  putative_keys = [nil, 'complete_putative', 'n_terminal_putative', 'c_terminal_putative', nil, nil, 'coding_putative']

  good_seqs = stats_hash['good_seqs']
  to_percent = lambda { |count| good_seqs.to_i.zero? ? 0.0 : count * 100.0 / good_seqs }
  # Quotes a label as a JavaScript single-quoted string literal.
  js_quote = lambda { |label| "'" + label.to_s.gsub(/[\\']/) { |ch| "\\" + ch } + "'" }

  values_structural_sure = "[#{sure_keys.map { |key| to_percent.call(stats_hash[key]) }.join(', ')}]"
  values_structural_putative = "[#{putative_keys.map { |key| to_percent.call(key ? stats_hash[key] : 0) }.join(', ')}]"

  # Highest counts first; the slice keeps at most 21 entries.
  data = stats_taxonomy.to_a.sort { |s2, s1| s1.last <=> s2.last }[0..20]
  smps_taxonomy = "[#{data.map { |tax| js_quote.call(tax.first) }.join(', ')}]"
  values_taxonomy = "[#{data.map { |tax| tax.last }.join(', ')}]"

  html_file.puts '<head>
    <title>FLN Summary</title>
    <meta http-equiv="CACHE-CONTROL" CONTENT="NO-CACHE">
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />

    <!--[if lt IE 9]><script type="text/javascript" src="./expresscanvas/js/flashcanvas.js"></script><![endif]-->
    <script type="text/javascript" src="./expresscanvas/js/canvasXpress.min.js"></script>

    <script id=\'demoScript\'>
      var showDemo = function () {'

  #'smpTitle': 'Status',

  html_file.puts "new CanvasXpress('profile',
        {
          'y' : {
            'vars' : ['Sure', 'Putative'],
            'smps' : ['Unknown', 'Complete', 'N-terminal', 'C-terminal', 'Internal', 'ncrna', 'Coding'],
            'data' : [#{values_structural_sure},
                      #{values_structural_putative}],
          },
          'a' : {
            'xAxis' : ['Sure', 'Putative']
          },
        },

        {'gradient': false,
         'toolbarPermanent': true,
         'graphOrientation': 'vertical',
         'graphType': 'Stacked',
         'legendBackgroundColor': false,
         'smpLabelScaleFontFactor': 0.8,
         'xAxisTitle': '% sequences',
         'xAxis2Show': false,
         'xAxisExact': true,
         'setMaxX': 80,
         'setMinX': 0,
         'axisTitleScaleFontFactor': 2,
         'smpTitleFontStyle': 'italic',
         'titleHeight': 60
        }
      );

      new CanvasXpress('taxonomy',
        {
          'y' : {
            'vars' : ['Annotations'],
            'smps' : #{smps_taxonomy},
            'data' : [#{values_taxonomy}],
          },
          'a' : {
            'xAxis' : ['Sure', 'Putative']
          },
        },

        {'gradient': false,
         'toolbarPermanent': true,
         'graphOrientation': 'horizontal',
         'showLegend': false,
         'smpLabelScaleFontFactor': 1.5,
         'xAxisTitle': 'Number of sequences',
         'xAxis2Show': false,
         'titleHeight': 60
        }
      );
      }
    </script>
    </head>"
end
447
366
 
448
- def testcode_stats(size_filter1,size_filter2)
449
-
450
- seqs_number = 0
451
- uni_500 = 0
452
- uni_200 = 0
453
- longest_one = 0
454
-
455
- # total, status
456
- coding_stats = [0,'Coding',0,0]
457
- p_coding_stats = [0,'Putative Coding',0,0]
458
- unknown_stats = [0,'Unknown',0,0]
459
-
460
- File.open('fln_results/new_coding.txt').each do |line|
461
- line.chomp!
462
- (name,fasta_length,acc,db_name,status) = line.split("\t")
463
-
464
- if (line !~ /^Query_id\t/) && (!line.empty?)
465
- seqs_number += 1
466
-
467
- if (fasta_length.to_i > longest_one)
468
- longest_one = fasta_length.to_i
469
- end
470
-
471
- # -------------------------------------------------------------------------
472
- if (fasta_length.to_i >= size_filter1)
473
- uni_200 += 1
474
- end
475
- if (fasta_length.to_i >= size_filter2)
476
- uni_500 += 1
477
- end
478
- # -------------------------------------------------------------------------
479
-
480
- if (fasta_length.to_i > size_filter1)
481
- if (status == 'coding')
482
- coding_stats[2] += 1
483
- elsif (status == 'putative_coding')
484
- p_coding_stats[2] += 1
485
- elsif (status == 'unknown')
486
- unknown_stats[2] += 1
487
- end
488
- end
489
-
490
- if (fasta_length.to_i > size_filter2)
491
- if (status == 'coding')
492
- coding_stats[3] += 1
493
- elsif (status == 'putative_coding')
494
- p_coding_stats[3] += 1
495
- elsif (status == 'unknown')
496
- unknown_stats[3] += 1
497
- end
498
- end
499
-
500
- if (status == 'coding')
501
- coding_stats[0] += 1
502
- elsif (status == 'putative_coding')
503
- p_coding_stats[0] += 1
504
- elsif (status == 'unknown')
505
- unknown_stats[0] += 1
506
- end
367
# Writes the <body> section of the representative-transcriptome summary page:
# a title banner followed by one container div holding the two reptrans tables.
#
# html_file  - object responding to #puts that receives the HTML lines.
# stats_hash - handed unchanged to reptrans_report and reptrans_acumulative_report.
def body_reptrans(html_file, stats_hash)
  html_file.puts '<body bgcolor="#FFFFFF" >', '<center>' # Start body
  html_file.puts '<div style="float:center; font-size:30; margin:10px"><b>', 'Full-LengtherNEXT Representative Transcriptome Summary', '</b></div>'
  # TABLES
  html_file.puts '<div style=" width: 850px; height: 350px; padding: 10 ">'
  reptrans_report(html_file, stats_hash, 'left')
  # The align value is pasted verbatim into a CSS "float:" declaration, so it
  # must be a valid keyword ('rigth' was silently ignored by browsers).
  reptrans_acumulative_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  # END TABLES
  html_file.puts '</center>', '</body>' # End body
end
507
378
 
508
- end
379
# Writes the <body> section of the main FLN summary page: a title banner and
# three container divs, each pairing a left-floated block with a right-floated one.
#
# html_file  - object responding to #puts that receives the HTML lines.
# stats_hash - handed unchanged to the *_report helpers.
def body(html_file, stats_hash)
  # NOTE(review): "id=demo" sits inside the onload handler, so it is run as
  # JavaScript rather than set as an attribute; it looks like it was meant to
  # be a separate id="demo" attribute. Left unchanged, confirm intent.
  html_file.puts '<body bgcolor="#FFFFFF" onload="showDemo(); id=demo">', '<center>' # Start body
  html_file.puts '<div style="float:center; font-size:30; margin:10px"><b>', 'Full-LengtherNEXT Summary', '</b></div>'

  # TABLES
  html_file.puts '<div style="overflow: hidden; width: 950px; height: 550px; padding: 10 ">'
  general_report(html_file, stats_hash, 'left')
  assembly_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  html_file.puts '<div style="overflow: hidden; width: 950px; height: 550px; padding: 10 ">'
  status_graph(html_file, 'left')
  # The align value ends up in a CSS "float:" declaration; 'rigth' was an
  # invalid keyword and was dropped by browsers.
  status_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  html_file.puts '<div style="overflow: hidden; width: 950px; height: 750px; padding: 10 ">'
  taxonomy_graph(html_file, 'left')
  database_report(html_file, stats_hash, 'right')
  html_file.puts '</div>'
  # END TABLES
  html_file.puts '</center>', '</body>' # End body
end
509
399
 
510
- end
511
400
 
512
- status_array = [coding_stats, p_coding_stats, unknown_stats]
513
401
 
514
- return [status_array, seqs_number, uni_500, uni_200, longest_one]
515
- end
402
# Builds the "Sequences info" table and writes it to html_file.
# The reference total for every percentage is the sum of all values in
# stats_hash; each row shows one category's own count.
#
# html_file  - destination passed on to write_array_html.
# stats_hash - counts keyed by category name (all values are summed).
# align      - inserted verbatim into the wrapper div's CSS "float:".
def reptrans_report(html_file, stats_hash, align)
  total = stats_hash.values.inject(0) { |sum, count| sum + count }
  categories = [
    ['Annotated with protein', 'prot_annotated'],
    ['Annotated with EST', 'est_annotated'],
    ['Coding test-code > 1', 'coding_>1'],
    ['Coding test-code > 0.94', 'coding_>0.94'],
    ['Coding test-code > 0.84', 'coding_>0.84'],
    ['Coding test-code > 0.73', 'coding_>0.73'],
    ['Coding test-code > 0', 'coding_>0']
  ]

  lines = ['<div style=" margin: 0; float:' + align + '">']
  lines << table_title('Sequences info')
  lines.concat(table_header(['', 'Sequences', '%'], 0))
  lines.concat(single_row('Output', total, total))
  categories.each do |label, key|
    lines.concat(single_row(label, stats_hash[key], total))
  end
  lines << '</table>'
  lines << '</div>'
  write_array_html(lines, html_file)
end
516
421
 
422
# Builds the cumulative "Sequences summary" table and writes it to html_file.
# Each row shows its own category count plus the counts of all rows above it;
# percentages are relative to the sum of all values in stats_hash.
#
# html_file  - destination passed on to write_array_html.
# stats_hash - counts keyed by category name (all values are summed).
# align      - inserted verbatim into the wrapper div's CSS "float:".
def reptrans_acumulative_report(html_file, stats_hash, align)
  total = stats_hash.values.inject(0) { |sum, count| sum + count }
  steps = [
    ['Annotated with protein', 'prot_annotated'],
    ['Annotated with EST', 'est_annotated'],
    ['Coding test-code > 1', 'coding_>1'],
    ['Coding test-code > 0.94', 'coding_>0.94'],
    ['Coding test-code > 0.84', 'coding_>0.84'],
    ['Coding test-code > 0.73', 'coding_>0.73']
  ]

  lines = ['<div style=" margin: 0; float:' + align + '">']
  lines << table_title('Sequences summary (Acumulative)')
  lines.concat(table_header(['', 'Sequences', '%'], 0))

  running = 0
  steps.each_with_index do |(label, key), idx|
    count = stats_hash[key]
    # The first row is shown as-is; later rows add everything seen so far.
    shown = idx.zero? ? count : count + running
    lines.concat(single_row(label, shown, total))
    running += count
  end

  lines << '</table>'
  lines << '</div>'
  write_array_html(lines, html_file)
end
445
+
446
# Builds the "General info" table (input, failures, artifacts and their
# sub-categories, resolved chimeras, clean sequences) and writes it to html_file.
# A "BA index" row is appended only when stats_hash['BA_index'] is above zero.
#
# html_file  - destination passed on to write_array_html.
# stats_hash - counts keyed by the names read below.
# align      - inserted verbatim into the wrapper div's CSS "float:".
def general_report(html_file, stats_hash, align)
  html = []
  html << '<div style="margin: 0; float:' + align + '">'
  html << table_title('General info')
  html.concat(table_header(['', 'Sequences', '%'], 0))
  html.concat(single_row('Input', stats_hash['input_seqs'], stats_hash['input_seqs']))
  html.concat(single_row('Failing sequences', stats_hash['failed'], stats_hash['output_seqs']))
  html.concat(single_row('Artifacts <sup>1</sup>', stats_hash['artifacts'], stats_hash['output_seqs']))
  # Sub-rows are indented (4th argument) and use the artifact count as 100%.
  # Lowercase `true` is used because the TRUE constant no longer exists in
  # current Ruby versions.
  html.concat(single_row('Misassembled', stats_hash['misassembled'], stats_hash['artifacts'], true))
  html.concat(single_row('Chimeras', stats_hash['chimeras'], stats_hash['artifacts'], true))
  html.concat(single_row('Other', stats_hash['other_artifacts'], stats_hash['artifacts'], true))
  html.concat(single_row('Sequences with resolved chimeras', stats_hash['output_seqs'], stats_hash['input_seqs']))
  html.concat(single_row('Sequences without artifacts', stats_hash['good_seqs'], stats_hash['output_seqs']))
  # A nil total makes the percentage column show '-'.
  html.concat(single_row('BA index', "%5.2f" % [stats_hash['BA_index']], nil)) if stats_hash['BA_index'] > 0
  html << '</table>'
  html << '</div>'
  write_array_html(html, html_file)
end
517
464
 
518
- def ncrna_stats(size_filter1,size_filter2)
465
# Writes the placeholder block for the taxonomy chart to html_file: a div whose
# CSS float is taken verbatim from align, the title produced by table_title,
# and a one-cell table holding a 540x640 <canvas> with id 'taxonomy'.
# html_file must respond to #puts; nothing is computed here.
+ def taxonomy_graph(html_file, align)
466
+ html_file.puts '<div style=\'float:'+align+'\'>'
467
+ html_file.puts table_title('Taxonomy distribution on annotations')
468
+ html_file.puts '<table >
469
+ <tr>
470
+ <td>
471
+ <canvas id=\'taxonomy\' width=\'540\' height=\'640\'></canvas>
472
+ </td>
473
+ </tr>
474
+ </table>
475
+ </div>'
476
+
477
+ end
519
478
 
520
- uni_500 = 0
521
- uni_200 = 0
522
- nc_total = 0
523
- longest_one = 0
524
479
 
525
- File.open('fln_results/nc_rnas.txt').each do |line|
526
- line.chomp!
527
- (name,fasta_length,acc,db_name,status) = line.split("\t")
480
# Builds the "Database usage" table (one row per annotation source plus a
# "None" row and a total) and writes it to html_file. All percentages are
# relative to stats_hash['good_seqs'].
#
# html_file  - destination passed on to write_array_html.
# stats_hash - counts keyed by the names read below.
# align      - inserted verbatim into the wrapper div's CSS "float:".
def database_report(html_file, stats_hash, align)
  good_seqs = stats_hash['good_seqs']
  html = []
  # The ';' after "margin: 0" is required: without it the margin and float
  # declarations merge into a single invalid one and the float is lost.
  html << '<div style=" margin: 0; float:' + align + '">'
  html << table_title('Database usage')
  html.concat(table_header(['', 'Unigenes', '%'], 0))
  html.concat(single_row('UserDB', stats_hash['userdb'], good_seqs))
  html.concat(single_row('SwissProt', stats_hash['swissprot'], good_seqs))
  html.concat(single_row('TrEMBL', stats_hash['trembl'], good_seqs))
  html.concat(single_row('ncRNA', stats_hash['ncrna'], good_seqs))
  # "None" groups the sequences counted as coding or unknown.
  html.concat(single_row('None', stats_hash['coding'] + stats_hash['unknown'], good_seqs))
  html.concat(single_row('Total', good_seqs, good_seqs))
  html << '</table>'
  html << '</div>'
  write_array_html(html, html_file)
end
528
495
 
529
- if (status == 'Putative ncRNA')
496
# Builds the "Report guiding assembly quality" table and writes it to html_file.
# Top-level rows are relative to stats_hash['good_seqs']; indented sub-rows use
# their parent row (marked with a superscript 1) as the 100% reference.
#
# html_file  - destination passed on to write_array_html.
# stats_hash - counts keyed by the names read below.
# align      - inserted verbatim into the wrapper div's CSS "float:".
def assembly_report(html_file, stats_hash, align)
  good_seqs = stats_hash['good_seqs']
  with_orthologue = stats_hash['prot_annotated']
  html = []
  html << '<div style=" margin: 0; float:' + align + '">'
  html << table_title('Report guiding assembly quality')
  html.concat(table_header(['', 'Unigenes', '%'], 0))
  html.concat(single_row('Unigenes', good_seqs, good_seqs))
  html.concat(single_row('Unigenes >500pb', stats_hash['sequences_>500'], good_seqs))
  html.concat(single_row('Unigenes >200pb', stats_hash['sequences_>200'], good_seqs))
  # A nil total makes the percentage column show '-'.
  html.concat(single_row('Longest unigene', stats_hash['longest_unigene'], nil))
  html.concat(single_row('With orthologue <sup>1</sup>', with_orthologue, good_seqs))
  # Lowercase `true` (indent flag) is used because the TRUE constant no longer
  # exists in current Ruby versions.
  html.concat(single_row('Different orthologue IDs', stats_hash['different_orthologues'], with_orthologue, true))
  html.concat(single_row('Complete transcripts', stats_hash['complete'], with_orthologue, true))
  html.concat(single_row('Different complete transcripts ', stats_hash['different_completes'], with_orthologue, true))
  html.concat(single_row('ncRNA', stats_hash['ncrna'], good_seqs))
  without_orthologue = stats_hash['coding'] + stats_hash['unknown']
  html.concat(single_row('Without orthologue <sup>1</sup>', without_orthologue, good_seqs))
  html.concat(single_row('Coding (all)', stats_hash['coding'], without_orthologue, true))
  html.concat(single_row('Coding > 200bp', stats_hash['coding_>200'], without_orthologue, true))
  html.concat(single_row('Coding > 500bp', stats_hash['coding_>500'], without_orthologue, true))
  html.concat(single_row('Unknown (all)', stats_hash['unknown'], without_orthologue, true))
  html.concat(single_row('Unknown > 200bp', stats_hash['unknown_>200'], without_orthologue, true))
  html.concat(single_row('Unknown > 500bp', stats_hash['unknown_>500'], without_orthologue, true))
  html << '</table>'
  html << '<sup>1</sup> Percents for subclassifications of this category <br> were calculated using this line as 100% reference.'
  html << '</div>'
  write_array_html(html, html_file)
end
530
523
 
531
- if (fasta_length.to_i > longest_one)
532
- longest_one = fasta_length.to_i
533
- end
534
- # -------------------------------------------------------------------------
535
- if (fasta_length.to_i >= size_filter1)
536
- uni_200 += 1
537
- end
538
- if (fasta_length.to_i >= size_filter2)
539
- uni_500 += 1
540
- end
541
- # -------------------------------------------------------------------------
524
# Writes the placeholder block for the structural-profile chart to html_file:
# a div whose CSS float is taken verbatim from align, the title produced by
# table_title, and a one-cell table holding a 500x440 <canvas> with id 'profile'.
# html_file must respond to #puts; nothing is computed here.
+ def status_graph(html_file, align)
525
+ html_file.puts '<div style=\'float:'+align+'\'>'
526
+ html_file.puts table_title('Structural profile')
527
+ html_file.puts '<table >
528
+ <tr>
529
+ <td>
530
+ <canvas id=\'profile\' width=\'500\' height=\'440\'></canvas>
531
+ </td>
532
+ </tr>
533
+ </table>
534
+ </div>'
542
535
 
543
- nc_total += 1
536
+ end
544
537
 
545
- end
538
# Builds the "Status report" table and writes it to html_file. Categories that
# have sure/putative variants get a two-line fused row; the rest get a single
# row spanning both label columns. All percentages are relative to
# stats_hash['good_seqs'].
#
# html_file  - destination passed on to write_array_html.
# stats_hash - counts keyed by the names read below.
# align      - inserted verbatim into the wrapper div's CSS "float:".
def status_report(html_file, stats_hash, align)
  reference = stats_hash['good_seqs']
  rows = ['<div style=" margin: 0; float:' + align + '">', table_title('Status report')]
  rows.concat(table_header(['Status', 'Unigenes', '%'], 2))
  rows.concat(fused_row('Complete', stats_hash['complete_sure'], stats_hash['complete_putative'], reference))
  rows.concat(fused_row('C-terminus', stats_hash['c_terminal_sure'], stats_hash['c_terminal_putative'], reference))
  rows.concat(fused_row('N-terminus', stats_hash['n_terminal_sure'], stats_hash['n_terminal_putative'], reference))
  rows.concat(composed_single_row('Internal', stats_hash['internal'], reference))
  rows.concat(fused_row('Coding', stats_hash['coding_sure'], stats_hash['coding_putative'], reference))
  [['ncRNA', stats_hash['ncrna']], ['Unknown', stats_hash['unknown']], ['Total', reference]].each do |label, count|
    rows.concat(composed_single_row(label, count, reference))
  end
  rows.push('</table>', '</div>')
  write_array_html(rows, html_file)
end
555
+
556
+
557
# Returns the HTML snippet used as the heading above a table or chart:
# the given title in bold inside a styled div.
def table_title(title)
  opening = '<div style="font-size:25px; margin: 10"><b>'
  closing = '</b></div>'
  opening + title + closing
end
561
+
562
# Returns the opening lines of an HTML table: the <table> tag and a header row
# with one <th> per entry of col_array. When colspan is above zero, the first
# header cell spans that many columns. The table is left open for the caller
# to add rows and close.
def table_header(col_array, colspan)
  lines = ['<table border="2" cellspacing="0" cellpadding="2">', '<tr>']
  col_array.each_with_index do |label, idx|
    widened = idx.zero? && colspan > 0
    lines << if widened
               '<th colspan="' + colspan.to_s + '">' + label + '</th>'
             else
               '<th>' + label + '</th>'
             end
  end
  lines << '</tr>'
end
547
578
 
548
- return [nc_total, uni_500, uni_200, longest_one]
579
# Returns the HTML lines of one table row: a left-aligned label cell followed
# by the count and percentage cells produced by sub_row.
#
# name      - row label (String).
# magnitude - value shown in the count column.
# total     - 100% reference passed to sub_row.
# space     - when truthy, the label is indented with non-breaking spaces.
#             Defaults to lowercase `false`: the FALSE constant no longer
#             exists in current Ruby, so the old default raised NameError
#             whenever this argument was omitted.
def single_row(name, magnitude, total, space = false)
  name = '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;' + name if space
  html = ['<tr>', '<td align="left">' + name + '</td>']
  html.concat(sub_row(magnitude, total))
  html << '</tr>'
end
550
590
 
551
- def chimera_stats(size_filter1,size_filter2)
552
-
553
- uni_500 = 0
554
- uni_200 = 0
555
- ch_total = 0
556
- longest_one = 0
557
- db_usage = [0,0,0]
558
-
559
- if !File.exists?('fln_results/chimeric_sequences.txt')
560
- return [0, 0, 0, longest_one, db_usage]
591
+
592
# Returns the HTML lines of a two-line table entry: a label cell spanning both
# lines, then a "Sure" line and a "Putative" line, each with its own count and
# percentage cells from sub_row.
#
# type               - label shown in the spanning first cell (String).
# sure_magnitude     - count for the "Sure" line.
# putative_magnitude - count for the "Putative" line.
# total              - 100% reference passed to sub_row for both lines.
def fused_row(type, sure_magnitude, putative_magnitude, total)
  html = []
  # Open the first row explicitly; it was previously closed without ever being
  # opened, which left the table markup unbalanced.
  html << '<tr>'
  html << '<td rowspan="2" align="left">' + type + '</td>'
  html << seq_status('Sure')
  html.concat(sub_row(sure_magnitude, total))
  html << '</tr>'
  html << '<tr>'
  html << seq_status('Putative')
  html.concat(sub_row(putative_magnitude, total))
  html << '</tr>'
  html
end
604
+
605
# Returns a left-aligned table cell containing the given status label.
def seq_status(status)
  ['<td align="left">', '</td>'].inject { |opening, closing| opening + status + closing }
end
609
+
610
# Returns the two right-aligned cells that end a table row: the raw count and
# its percentage of total with two decimals.
#
# magnitude - value shown in the count cell (rendered with to_s).
# total     - 100% reference; nil means "no percentage" and shows '-'.
#
# The percentage is also '-' whenever the division gives no finite number,
# i.e. when total is zero. Only the 0/0 (NaN) case used to be handled; a
# non-zero magnitude over a zero total produced Infinity, which the previous
# string-based formatting could not convert and raised ArgumentError.
def sub_row(magnitude, total)
  percentage = '-'
  unless total.nil?
    perc_float = magnitude * 100.0 / total
    percentage = format('%.2f%%', perc_float) if perc_float.finite?
  end
  [
    '<td align="right">' + magnitude.to_s + '</td>',
    '<td align="right">' + percentage + '</td>'
  ]
end
627
+
628
# Returns the HTML lines of a table row whose label cell spans two columns,
# followed by the count and percentage cells produced by sub_row.
def composed_single_row(type, magnitude, total)
  label_cell = '<td colspan="2" align="left">' + type + '</td>'
  ['<tr>', label_cell].concat(sub_row(magnitude, total)).push('</tr>')
end
598
636
 
599
637
 
638
# Writes every element of html to html_file with #puts, in order.
# Returns an array holding the result of each #puts call.
def write_array_html(html, html_file)
  html.map do |row|
    html_file.puts(row)
  end
end
600
641
  end