full_lengther_next 0.6.2 → 0.9.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +5 -0
  5. data/CODE_OF_CONDUCT.md +49 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +21 -0
  8. data/{README.rdoc → README.md} +0 -0
  9. data/Rakefile +6 -37
  10. data/bin/console +14 -0
  11. data/bin/download_fln_dbs.rb +2 -7
  12. data/bin/full_lengther_next +85 -6
  13. data/bin/make_user_db.rb +13 -5
  14. data/bin/setup +8 -0
  15. data/full_lengther_next.gemspec +42 -0
  16. data/lib/full_lengther_next.rb +2 -10
  17. data/lib/full_lengther_next/artifacts.rb +74 -0
  18. data/lib/full_lengther_next/{classes/blast_functions.rb → blast_functions.rb} +0 -0
  19. data/lib/full_lengther_next/{classes/cdhit.rb → cdhit.rb} +0 -0
  20. data/lib/full_lengther_next/{classes/chimeric_seqs.rb → chimeric_seqs.rb} +0 -0
  21. data/lib/full_lengther_next/{classes/common_functions.rb → common_functions.rb} +0 -0
  22. data/lib/full_lengther_next/{classes/exonerate_result.rb → exonerate_result.rb} +0 -0
  23. data/lib/full_lengther_next/{classes/fl_analysis.rb → fl_analysis.rb} +0 -0
  24. data/lib/full_lengther_next/{classes/fl_string_utils.rb → fl_string_utils.rb} +0 -0
  25. data/lib/full_lengther_next/fln_stats.rb +613 -0
  26. data/lib/full_lengther_next/go_methods.rb +42 -0
  27. data/lib/full_lengther_next/{classes/handle_db.rb → handle_db.rb} +0 -0
  28. data/lib/full_lengther_next/mapping.rb +296 -0
  29. data/lib/full_lengther_next/{classes/my_worker.rb → my_worker.rb} +71 -9
  30. data/lib/full_lengther_next/{classes/my_worker_EST.rb → my_worker_EST.rb} +0 -0
  31. data/lib/full_lengther_next/{classes/my_worker_manager_EST.rb → my_worker_manager_EST.rb} +0 -0
  32. data/lib/full_lengther_next/{classes/my_worker_manager_fln.rb → my_worker_manager_fln.rb} +181 -16
  33. data/lib/full_lengther_next/{classes/nc_rna.rb → nc_rna.rb} +0 -0
  34. data/lib/full_lengther_next/{classes/orf.rb → orf.rb} +0 -0
  35. data/lib/full_lengther_next/{classes/reptrans.rb → reptrans.rb} +9 -5
  36. data/lib/full_lengther_next/{classes/sequence.rb → sequence.rb} +26 -1
  37. data/lib/full_lengther_next/{classes/test_code.rb → test_code.rb} +1 -1
  38. data/lib/full_lengther_next/{classes/types.rb → types.rb} +3 -2
  39. data/lib/full_lengther_next/{classes/une_los_hit.rb → une_los_hit.rb} +0 -0
  40. data/lib/full_lengther_next/version.rb +3 -0
  41. data/lib/full_lengther_next/{classes/warnings.rb → warnings.rb} +0 -0
  42. data/report_templates/general_summary.erb +140 -0
  43. data/report_templates/mapping_summary.erb +98 -0
  44. data/report_templates/reptrans_summary.erb +32 -0
  45. metadata +112 -134
  46. data/.gemtest +0 -0
  47. data/History.txt +0 -32
  48. data/Manifest.txt +0 -44
  49. data/PostInstall.txt +0 -6
  50. data/bin/plot_fln.rb +0 -270
  51. data/bin/plot_taxonomy.rb +0 -70
  52. data/lib/expresscanvas.zip +0 -0
  53. data/lib/full_lengther_next/classes/artifacts.rb +0 -66
  54. data/lib/full_lengther_next/classes/fln_stats.rb +0 -641
  55. data/script/console +0 -10
  56. data/script/destroy +0 -14
  57. data/script/generate +0 -14
  58. data/test/test_full_lengther_next.rb +0 -11
  59. data/test/test_helper.rb +0 -3
@@ -1,13 +1,5 @@
1
- $:.unshift(File.dirname(__FILE__)) unless
2
- $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
-
4
- # ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')
5
-
6
- $: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', 'classes'))
7
-
1
+ require "full_lengther_next/version"
8
2
 
9
3
  module FullLengtherNext
10
- VERSION = '0.6.2'
11
-
12
- FULL_LENGHTER_VERSION = VERSION
4
+ # Your code goes here...
13
5
  end
@@ -0,0 +1,74 @@
1
+ require 'blast_functions'
2
+ require 'types'
3
+ require 'chimeric_seqs'
4
+ include ChimericSeqs
5
+
6
+ #####################################################################
7
+ ## MAIN FUNCTION
8
+ #####################################################################
9
# Decide whether +seq+ is an assembly artifact and flag it accordingly.
#
# The checks run in this order: unmapped contig, misassembly, multiple HSPs
# (warning only, never marks the sequence as an artifact) and chimera.
#
# seq      - sequence object; its hit, type, db_name, save_fasta and ignore
#            attributes may be written and warnings may be added.
# query    - result object exposing +hits+, or nil when there is none.
# db_name  - stored in seq.db_name when an artifact is found.
# db_path  - forwarded to search_chimeras.
# options  - hash; chimera detection is skipped when options[:chimera]
#            includes 'd'.
# new_seqs - array that receives the sequences returned by search_chimeras.
#
# Returns true when the sequence was classified as an artifact, else false.
def artifact?(seq, query, db_name, db_path, options, new_seqs)
  # Lowercase literals: the TRUE/FALSE constants no longer exist in Ruby >= 3.2.
  artifact = false

  # UNMAPPED CONTIG DETECTION
  if query.nil? && seq.unmapped? # An unmapped seq stops any further analysis
    seq.hit = nil
    artifact = true
    seq.type = UNMAPPED
  end

  unless query.nil?
    # MISASSEMBLED DETECTION
    if !artifact && misassembled_detection(query) # If seq is misassembled stop chimera analysis
      seq.hit = query.hits.first
      artifact = true
      seq.type = MISASSEMBLED
      seq.warnings('ERROR#1')
    end

    # OVERLAPPING HSPS ON SUBJECT DETECTION (disabled in the source)
    # if !artifact
    #   hit_reference = query.hits.first.dup
    #   query, overlapping = overlapping_hsps_on_subject(query)
    #   if overlapping
    #     if query.hits.first.nil?
    #       seq.hit = hit_reference
    #     else
    #       seq.hit = query.hits.first
    #     end
    #     artifact = true
    #     seq.type = OTHER
    #     seq.warnings('ERROR#2')
    #   end
    # end

    # MULTIPLE HSP DETECTION (only records a warning)
    if !artifact && multiple_hsps(query, 3)
      seq.hit = query.hits.first
      seq.warnings('ERROR#3')
    end

    # CHIMERA DETECTION
    if !artifact && !options[:chimera].include?('d')
      chimera = search_chimeras(seq, query, options, db_name, db_path)
      unless chimera.nil?
        new_seqs.concat(chimera)
        seq.db_name = db_name
        seq.type = CHIMERA
        artifact = true
      end
    end
  end

  if artifact
    puts seq.prot_annot_calification if $verbose > 1
    seq.db_name = db_name
    seq.save_fasta = false
    seq.ignore = true
  end
  artifact
end
72
+
73
+
74
+
@@ -0,0 +1,613 @@
1
+ require 'report_html'
2
+ require 'types.rb'
3
+ require 'go_methods'
4
+
5
+ module FlnStats
6
+ REPORT_FOLDER = File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'report_templates'))
7
# Build a fresh counters hash for the main FLN summary.
#
# Every key listed below is present and starts at 0; the key order is the
# order in which the counters are listed.
def initialize_stats_hash
  counter_names = %w[
    input_seqs
    output_seqs
    failed
    full_transcriptome_length
    PRE_FLN_full_transcriptome_length
    mean_length
    PRE_FLN_mean_length
    indeterminations
    PRE_FLN_indeterminations
    gap_number
    PRE_FLN_gap_number
    indetermination_mean_length
    PRE_FLN_indetermination_mean_length
    sequences_>200
    sequences_>500
    PRE_FLN_sequences_>500
    longest_unigene
    n50
    PRE_FLN_n50
    n90
    PRE_FLN_n90
    good_seqs
    artifacts
    misassembled
    chimeras
    unmapped
    other_artifacts
    unknown
    unknown_>200
    unknown_>500
    prot_annotated
    complete
    complete_sure
    complete_putative
    n_terminal
    n_terminal_sure
    n_terminal_putative
    c_terminal
    c_terminal_sure
    c_terminal_putative
    internal
    swissprot
    trembl
    userdb
    ncrna
    coding
    coding_sure
    coding_putative
    coding_>200
    coding_>500
    different_orthologues
    different_completes
    BA_index
  ]
  counter_names.each_with_object({}) { |counter, counters| counters[counter] = 0 }
end
66
+
67
# Extract an organism name from a subject description and count it.
#
# name     - description string; it is only read, never modified.
# taxonomy - hash of organism => number of occurrences; updated in place.
#
# Three description layouts are recognised:
# * contains 'OS=': the organism is the text between 'OS=' and ' GN='.
# * starts with 'sp=' or 'tr=': the organism is the "Genus species" pair
#   that precedes one or two parenthesised groups.
# * anything else: the text after the first ';' up to the first '.'.
#
# The organism is truncated to its first two space-separated words before
# being counted. Returns the updated count, or nil when no organism was found.
def get_taxonomy(name, taxonomy)
  organism = nil
  if name.include?('OS=')
    fields = name.split('OS=', 2)
    organism = fields.last.split(' GN=').first.strip
  elsif name.start_with?('sp=', 'tr=')
    # A real comparison: the former `name[0..2] = 'sp=' || ...` was an
    # assignment that overwrote the caller's string and was always truthy.
    match = name.match(/(\w+ \w+) \(([\w ]+)\) \(([\w ]+)\)/) ||
            name.match(/(\w+ \w+) \(([\w ]+)\)/)
    organism = match[1] if match
  else
    organism = name.split(";", 2).last
    organism = organism.split('.', 2).first
    organism.gsub!(/\(\D+\)/, '')
    if organism.split(' ').length > 1
      organism.gsub!('.', '')
      organism.gsub!(/^ /, '')
      # NOTE(review): this strips every space, so the two-word truncation
      # below sees a single token; confirm the intended pattern.
      organism.gsub!(' ', '')
      organism.strip!
    end
  end

  return nil if organism.nil?

  organism = organism.split(' ')[0..1].join(' ')
  taxonomy[organism] = (taxonomy[organism] || 0) + 1
end
102
+
103
# Build a fresh counters hash for the reptrans summary.
#
# Every key listed below is present and starts at 0, in the listed order.
def initialize_stats_hash_reptrans
  counter_names = [
    'prot_annotated',
    'est_annotated',
    'coding_>1',
    'coding_>0.94',
    'coding_>0.84',
    'coding_>0.73',
    'coding_>0'
  ]
  counter_names.each_with_object({}) { |counter, counters| counters[counter] = 0 }
end
115
+
116
+ # Extract sequence stats
117
+ ##################################################
118
# Accumulate the pre-FLN (input) statistics of a single sequence.
#
# seq        - object whose +seq_fasta+ returns the nucleotide string.
# stats_hash - counters hash; the 'input_seqs' and 'PRE_FLN_*' entries are
#              updated in place.
#
# The '>500' counter uses a strict comparison so that it agrees with its key
# and with the post-FLN 'sequences_>500' counter computed in summary_stats.
def sequence_stats(seq, stats_hash)
  nt_seq = seq.seq_fasta
  stats_hash['input_seqs'] += 1
  stats_hash['PRE_FLN_sequences_>500'] += 1 if nt_seq.length > 500
  stats_hash['PRE_FLN_full_transcriptome_length'] += nt_seq.length
  stats_hash['PRE_FLN_indeterminations'] += nt_seq.count('nN')
  stats_hash['PRE_FLN_gap_number'] += nt_seq.scan(/[nN]+/).length
end
126
+
127
+ # Build final stats
128
+ ####################################################
129
# Accumulate the post-FLN statistics of one batch of sequences.
#
# seqs                    - array of sequence objects (type, seq_fasta,
#                           fasta_length, hit, status, db_name and seq_name
#                           are read). May be empty.
# stats_hash              - counters hash, updated in place.
# diff_ids_array          - receives the hit accessions of sequences whose
#                           type is between UNKNOWN and NCRNA (exclusive);
#                           deduplicated in place.
# diff_ids_complete_array - receives the hit accessions of COMPLETE
#                           sequences; deduplicated in place.
# all_seq_lengths         - receives the lengths of the non-artifact sequences.
#
# Returns [stats_hash, diff_ids_array, diff_ids_complete_array, all_seq_lengths].
def summary_stats(seqs, stats_hash, diff_ids_array, diff_ids_complete_array, all_seq_lengths)
  low_limit = 200
  upper_limit = 500

  # All seqs
  #-----------
  stats_hash['output_seqs'] += seqs.length
  good_seqs = seqs.select { |s| s.type >= UNKNOWN }
  stats_hash['good_seqs'] += good_seqs.length

  # Indeterminations
  good_seqs.each do |s|
    stats_hash['indeterminations'] += s.seq_fasta.count('nN')
    stats_hash['gap_number'] += s.seq_fasta.scan(/[nN]+/).length
  end

  # Longest unigene (max is nil for an empty batch, so guard the comparison)
  current_longest_unigene = seqs.map { |s| s.fasta_length }.max
  if !current_longest_unigene.nil? && current_longest_unigene > stats_hash['longest_unigene']
    stats_hash['longest_unigene'] = current_longest_unigene
  end

  # Load ids
  seqs.each do |s|
    diff_ids_array << s.hit.acc if s.type > UNKNOWN && s.type < NCRNA
  end
  diff_ids_array.uniq!

  # By length
  unless good_seqs.empty?
    seq_lengths = good_seqs.map { |s| s.fasta_length }
    all_seq_lengths.concat(seq_lengths)
    stats_hash['full_transcriptome_length'] += seq_lengths.inject(0) { |sum, n| sum + n }
    stats_hash['sequences_>200'] += seq_lengths.count { |l| l > low_limit }
    stats_hash['sequences_>500'] += seq_lengths.count { |l| l > upper_limit }
  end

  stats_hash['failed'] += seqs.count { |s| s.type == FAILED }

  # Unknown
  #-----------------------------
  all_unknown = seqs.select { |s| s.type == UNKNOWN }
  stats_hash['unknown'] += all_unknown.length
  stats_hash['unknown_>200'] += all_unknown.count { |s| s.fasta_length > low_limit }
  stats_hash['unknown_>500'] += all_unknown.count { |s| s.fasta_length > upper_limit }

  # Artifacts
  #----------------
  stats_hash['artifacts'] += seqs.count { |s| s.type < UNKNOWN && s.type > FAILED }
  stats_hash['misassembled'] += seqs.count { |s| s.type == MISASSEMBLED }
  stats_hash['unmapped'] += seqs.count { |s| s.type == UNMAPPED }
  # Pieces named '*_split_*' are skipped so a multiple chimera is counted once
  stats_hash['chimeras'] += seqs.count { |s| s.type == CHIMERA && !s.seq_name.include?('_split_') }
  stats_hash['other_artifacts'] += seqs.count { |s| s.type == OTHER }

  # Annotated with prot
  #---------------------
  prot_annotated = seqs.select { |s| s.type >= COMPLETE && s.type <= INTERNAL }
  stats_hash['prot_annotated'] += prot_annotated.length

  # By annotation
  stats_hash['internal'] += seqs.count { |s| s.type == INTERNAL }
  complete = seqs.select { |s| s.type == COMPLETE }
  n_terminal = seqs.select { |s| s.type == N_TERMINAL }
  c_terminal = seqs.select { |s| s.type == C_TERMINAL }

  stats_hash['complete'] += complete.length
  stats_hash['n_terminal'] += n_terminal.length
  stats_hash['c_terminal'] += c_terminal.length

  # Load complete ids
  complete.each { |s| diff_ids_complete_array << s.hit.acc }
  diff_ids_complete_array.uniq!

  # ----> By status (a truthy status means "sure", otherwise "putative")
  stats_hash['complete_sure'] += complete.count { |s| s.status }
  stats_hash['n_terminal_sure'] += n_terminal.count { |s| s.status }
  stats_hash['c_terminal_sure'] += c_terminal.count { |s| s.status }
  stats_hash['complete_putative'] += complete.count { |s| !s.status }
  stats_hash['n_terminal_putative'] += n_terminal.count { |s| !s.status }
  stats_hash['c_terminal_putative'] += c_terminal.count { |s| !s.status }

  # By database (anything that is neither sp_* nor tr_* counts as user db)
  swissprot = prot_annotated.count { |s| s.db_name =~ /^sp_/ }
  trembl = prot_annotated.count { |s| s.db_name =~ /^tr_/ }
  stats_hash['swissprot'] += swissprot
  stats_hash['trembl'] += trembl
  stats_hash['userdb'] += prot_annotated.length - swissprot - trembl

  # ncRNA
  #----------------
  stats_hash['ncrna'] += seqs.count { |s| s.type == NCRNA }

  # Coding sequences
  #----------------
  coding = seqs.select { |s| s.type == CODING }
  stats_hash['coding'] += coding.length
  stats_hash['coding_sure'] += coding.count { |s| s.status }
  stats_hash['coding_putative'] += coding.count { |s| !s.status }
  stats_hash['coding_>200'] += coding.count { |s| s.fasta_length > low_limit }
  stats_hash['coding_>500'] += coding.count { |s| s.fasta_length > upper_limit }

  return stats_hash, diff_ids_array, diff_ids_complete_array, all_seq_lengths
end
239
+
240
# Compute N50 and N90 from a list of sequence lengths.
#
# stats_hash  - counters hash; stats_hash[f_tot_key] is read as the total
#               length and the results are stored under n50_key / n90_key.
# seq_lengths - array of lengths; it is sorted in place, longest first.
#
# Walking the lengths from longest to shortest, N50 is the length at which
# the cumulative fraction first exceeds 0.5 (only written while the stored
# value is still 0) and N90 the length at which it first exceeds 0.9.
# Both thresholds are checked on every step, so a single sequence that
# crosses 0.5 and 0.9 at once sets both values.
def calculate_n50_n90(stats_hash, f_tot_key, n50_key, n90_key, seq_lengths)
  f_tot_lengths = stats_hash[f_tot_key].to_f
  cum = 0
  seq_lengths.sort! { |a, b| b <=> a }
  seq_lengths.each do |length|
    cum += length
    fraction = cum / f_tot_lengths
    stats_hash[n50_key] = length if fraction > 0.5 && stats_hash[n50_key] == 0
    if fraction > 0.9
      stats_hash[n90_key] = length
      break
    end
  end
end
254
+
255
# Derive the final summary values once every batch has been accumulated.
#
# Stores the number of different orthologue / complete ids, the mean values
# (each one only when its denominator is positive), the N50/N90 pairs via
# calculate_n50_n90 and, when all its inputs are positive, the BA index.
#
# Returns stats_hash (the same object, updated in place).
def last_stats(stats_hash, diff_ids_array, diff_ids_complete_array, pre_fln_seq_lengths, seq_lengths)
  stats_hash['different_orthologues'] = diff_ids_array.length
  stats_hash['different_completes'] = diff_ids_complete_array.length

  # [result key, numerator key, denominator key]
  ratios = [
    ['mean_length', 'full_transcriptome_length', 'good_seqs'],
    ['indetermination_mean_length', 'indeterminations', 'gap_number'],
    ['PRE_FLN_mean_length', 'PRE_FLN_full_transcriptome_length', 'input_seqs'],
    ['PRE_FLN_indetermination_mean_length', 'PRE_FLN_indeterminations', 'PRE_FLN_gap_number']
  ]
  ratios.each do |result_key, numerator_key, denominator_key|
    next unless stats_hash[denominator_key] > 0
    stats_hash[result_key] = stats_hash[numerator_key].to_f / stats_hash[denominator_key]
  end

  calculate_n50_n90(stats_hash, 'full_transcriptome_length', 'n50', 'n90', seq_lengths)
  calculate_n50_n90(stats_hash, 'PRE_FLN_full_transcriptome_length', 'PRE_FLN_n50', 'PRE_FLN_n90', pre_fln_seq_lengths)

  # BA index
  ba_keys = ['prot_annotated', 'complete', 'sequences_>500', 'different_orthologues', 'different_completes']
  if ba_keys.all? { |key| stats_hash[key] > 0 }
    annotated = stats_hash['prot_annotated']
    completes = stats_hash['complete']
    coef_anot_geom = (annotated * completes * 1.0) / (stats_hash['sequences_>500'] * 10000)
    coef_mejora = (stats_hash['different_orthologues'] * 1.0 + stats_hash['different_completes']) / (annotated + completes)
    stats_hash['BA_index'] = Math.sqrt(coef_anot_geom * coef_mejora)
  end

  stats_hash
end
279
+
280
# Increment the reptrans counter matching the test-code score of a sequence.
#
# coding_seq - object whose +t_code+ returns the score.
# stats_hash - counters hash; the first bin whose threshold is strictly
#              below the score is incremented. Scores <= 0 are not counted.
#
# Returns the updated count, or nil when no bin matched.
def coding_stats_reptrans(coding_seq, stats_hash)
  # [exclusive lower threshold, counter key], checked from highest to lowest
  bins = [
    [1, 'coding_>1'],
    [0.95, 'coding_>0.94'],
    [0.85, 'coding_>0.84'],
    [0.73, 'coding_>0.73'],
    [0, 'coding_>0']
  ]
  matched_bin = bins.find { |threshold, _key| coding_seq.t_code > threshold }
  stats_hash[matched_bin.last] += 1 if matched_bin
end
297
+
298
# Build the data container consumed by the general summary report template.
#
# stats_hash                          - final counters hash.
# stats_taxonomy                      - hash of organism => annotation count.
# stats_functional_annotation_by_seqs - forwarded to go_for_graph, whose
#                                       result is merged into the container.
#
# Returns a hash holding the go_for_graph entries plus :general_report,
# :assembly_report, :structural_data, :status_report, :taxonomy and
# :database_report. Tables are arrays of rows whose first row is the header;
# percentage columns are appended by the add_percentages_* helpers.
def handle_data_main_summary(stats_hash, stats_taxonomy, stats_functional_annotation_by_seqs)
  container = {}

  identation = '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'

  # GENERAL REPORT TABLE
  #-------------------------------------------------------
  general_report = [
    ['', 'Sequences', '%'],
    ['Input', stats_hash['input_seqs']],
    [identation + 'N50 (bp)', stats_hash['PRE_FLN_n50']],
    [identation + 'N90 (bp)', stats_hash['PRE_FLN_n90']],
    [identation + 'Full transcriptome length (bp)', stats_hash['PRE_FLN_full_transcriptome_length']],
    [identation + 'Mean sequence length (bp)', '%.2f' % stats_hash['PRE_FLN_mean_length']],
    [identation + 'Nucleotide indeterminations (bp)', stats_hash['PRE_FLN_indeterminations']],
    [identation + 'Mean indetermination length (bp)', '%.2f' % stats_hash['PRE_FLN_indetermination_mean_length']],
    [identation + 'Unigenes >500pb', stats_hash['PRE_FLN_sequences_>500']],
    [identation + 'Failing sequences', stats_hash['failed']],
    [identation + 'Artifacts <sup>1</sup>', stats_hash['artifacts']],
    [identation * 2 + 'Unmapped transcripts', stats_hash['unmapped']],
    [identation * 2 + 'Misassembled', stats_hash['misassembled']],
    [identation * 2 + 'Chimeras', stats_hash['chimeras']],
    [identation * 2 + 'Other', stats_hash['other_artifacts']],
    ['Sequences with resolved chimeras', stats_hash['output_seqs']],
    ['Sequences without artifacts', stats_hash['good_seqs']],
    [identation + 'N50 (bp)', stats_hash['n50']],
    [identation + 'N90 (bp)', stats_hash['n90']],
    [identation + 'Full transcriptome length (bp)', stats_hash['full_transcriptome_length']],
    [identation + 'Mean sequence length (bp)', '%.2f' % stats_hash['mean_length']],
    [identation + 'Nucleotide indeterminations (bp)', stats_hash['indeterminations']],
    [identation + 'Mean indetermination length (bp)', '%.2f' % stats_hash['indetermination_mean_length']]
  ]
  # One denominator per data row, in row order; 0 means "no percentage".
  denominators = [
    stats_hash['input_seqs'],
    0,
    0,
    0,
    0,
    stats_hash['PRE_FLN_full_transcriptome_length'],
    0,
    stats_hash['input_seqs'],
    stats_hash['output_seqs'],
    stats_hash['output_seqs'],
    stats_hash['artifacts'],
    stats_hash['artifacts'],
    stats_hash['artifacts'],
    stats_hash['artifacts'],
    stats_hash['input_seqs'],
    stats_hash['output_seqs'],
    0,
    0,
    0,
    0,
    stats_hash['full_transcriptome_length'],
    0
  ]
  add_percentages_by_vector(general_report, 1, denominators)
  general_report << ['BA index', "%5.2f" % [stats_hash['BA_index']], '-'] if stats_hash['BA_index'] > 0

  # ASSEMBLY REPORT TABLE
  #-------------------------------------------------------
  without_orthologue = stats_hash['coding'] + stats_hash['unknown']
  assembly_report = [
    ['', 'Unigenes', '%'],
    ['Unigenes', stats_hash['good_seqs']],
    ['Unigenes >500pb', stats_hash['sequences_>500']],
    ['Unigenes >200pb', stats_hash['sequences_>200']],
    ['Longest unigene', stats_hash['longest_unigene']],
    ['With orthologue <sup>1</sup>', stats_hash['prot_annotated']],
    [identation + 'Different orthologue IDs', stats_hash['different_orthologues']],
    [identation + 'Complete transcripts', stats_hash['complete']],
    [identation + 'Different complete transcripts', stats_hash['different_completes']],
    ['ncRNA', stats_hash['ncrna']],
    ['Without orthologue <sup>1</sup>', without_orthologue],
    [identation + 'Coding (all)', stats_hash['coding']],
    [identation + 'Coding > 200bp', stats_hash['coding_>200']],
    [identation + 'Coding > 500bp', stats_hash['coding_>500']],
    [identation + 'Unknown (all)', stats_hash['unknown']],
    [identation + 'Unknown > 200bp', stats_hash['unknown_>200']],
    [identation + 'Unknown > 500bp', stats_hash['unknown_>500']]
  ]
  denominators = [
    stats_hash['good_seqs'],
    stats_hash['good_seqs'],
    stats_hash['good_seqs'],
    0,
    stats_hash['good_seqs'],
    stats_hash['prot_annotated'],
    stats_hash['prot_annotated'],
    stats_hash['prot_annotated'],
    stats_hash['good_seqs'],
    stats_hash['good_seqs'],
    without_orthologue,
    without_orthologue,
    without_orthologue,
    without_orthologue,
    without_orthologue,
    without_orthologue
  ]
  add_percentages_by_vector(assembly_report, 1, denominators)

  # STRUCTURAL PROFILE
  #-------------------------------------------------------
  structural_data = [
    ['Category', 'Sure', 'Putative'],
    ['Unknown', stats_hash['unknown'], 0],
    ['Complete', stats_hash['complete_sure'], stats_hash['complete_putative']],
    ['N-terminal', stats_hash['n_terminal_sure'], stats_hash['n_terminal_putative']],
    ['C-terminal', stats_hash['c_terminal_sure'], stats_hash['c_terminal_putative']],
    ['Internal', stats_hash['internal'], 0],
    ['ncrna', stats_hash['ncrna'], 0],
    # 'Sure' column holds coding_sure, like the other sure/putative rows;
    # using the coding total would count the putative ones twice.
    ['Coding', stats_hash['coding_sure'], stats_hash['coding_putative']]
  ]
  # Turn the positive counts into percentages of the non-artifact sequences.
  structural_data.each_with_index do |row, i|
    next if i == 0 # Skip header
    (1...row.length).each do |j|
      row[j] = row[j] * 100.0 / stats_hash['good_seqs'] if row[j] > 0
    end
  end

  # STATUS REPORT
  #----------------------------------------------------------
  status_report = [
    ['Status', 'colspan', 'Unigenes', '%'],
    ['Complete', 'Sure', stats_hash['complete_sure']],
    ['rowspan', 'Putative', stats_hash['complete_putative']],
    ['C-terminus', 'Sure', stats_hash['c_terminal_sure']],
    ['rowspan', 'Putative', stats_hash['c_terminal_putative']],
    ['N-terminus', 'Sure', stats_hash['n_terminal_sure']],
    ['rowspan', 'Putative', stats_hash['n_terminal_putative']],
    ['Internal', 'colspan', stats_hash['internal']],
    ['Coding', 'Sure', stats_hash['coding_sure']],
    ['rowspan', 'Putative', stats_hash['coding_putative']],
    ['ncRNA', 'colspan', stats_hash['ncrna']],
    ['Unknown', 'colspan', stats_hash['unknown']],
    ['Total', 'colspan', stats_hash['good_seqs']]
  ]
  add_percentages_by_scalar(status_report, 2, stats_hash['good_seqs'])

  # TAXONOMY PROFILE (the 21 organisms with most annotations, descending)
  #-------------------------------------------------------
  taxonomy = [
    ['Organism', 'Annotations']
  ].concat(stats_taxonomy.to_a.sort { |s2, s1| s1.last <=> s2.last }[0..20])

  # DATABASE REPORT
  #-------------------------------------------------------
  database_report = [
    ['', 'Unigenes', '%'],
    ['UserDB', stats_hash['userdb']],
    ['SwissProt', stats_hash['swissprot']],
    ['TrEMBL', stats_hash['trembl']],
    ['ncRNA', stats_hash['ncrna']],
    ['None', stats_hash['coding'] + stats_hash['unknown']],
    ['Total', stats_hash['good_seqs']]
  ]
  add_percentages_by_scalar(database_report, 1, stats_hash['good_seqs'])

  # GO ANNOTATION
  #-------------------------------------------------------
  container.merge!(go_for_graph(stats_functional_annotation_by_seqs))

  # BUILD CONTAINER
  #-------------------------------------------------------
  container[:general_report] = general_report
  container[:assembly_report] = assembly_report
  container[:structural_data] = structural_data
  container[:status_report] = status_report
  container[:taxonomy] = taxonomy
  container[:database_report] = database_report
  return container
end
468
+
469
+
470
# Build the data container consumed by the reptrans summary report template.
#
# stats_hash - reptrans counters hash; the sum of all its values is used as
#              the total number of output sequences.
#
# Returns a hash with :general_report (one row per category) and
# :acumulative_report (running total per category, in the same order).
# The running total always has one entry per category, including categories
# whose count is 0.
def handle_data_reptrans_summary(stats_hash)
  # [row label, counter key], in report order
  categories = [
    ['Annotated with protein', 'prot_annotated'],
    ['Annotated with EST', 'est_annotated'],
    ['Coding test-code > 1', 'coding_>1'],
    ['Coding test-code > 0.94', 'coding_>0.94'],
    ['Coding test-code > 0.84', 'coding_>0.84'],
    ['Coding test-code > 0.73', 'coding_>0.73'],
    ['Coding test-code > 0', 'coding_>0']
  ]

  # GENERAL REPORT
  #-------------------------------------------------------
  all_seqs = stats_hash.values.inject(0) { |sum, v| sum + v }
  general_report = [
    ['', 'Sequences', '%'],
    ['Output', all_seqs]
  ]
  categories.each { |label, key| general_report << [label, stats_hash[key]] }
  add_percentages_by_scalar(general_report, 1, all_seqs)

  # ACUMULATIVE REPORT
  #-------------------------------------------------------
  running_total = 0
  acumulative_report = [
    ['', 'Sequences', '%']
  ]
  categories.each do |label, key|
    running_total += stats_hash[key]
    acumulative_report << [label, running_total]
  end
  add_percentages_by_scalar(acumulative_report, 1, all_seqs)

  # BUILD CONTAINER
  #-------------------------------------------------------
  container = {}
  container[:general_report] = general_report
  container[:acumulative_report] = acumulative_report
  return container
end
529
+
530
# Append a percentage cell to every data row, using one denominator per row.
#
# table        - array of rows; the first row is the header and is skipped.
# col          - index of the cell holding the value of each row.
# denominators - array aligned with the data rows (denominators[0] belongs
#                to the first row after the header).
#
# The appended cell is the value as a percentage with two decimals and a
# trailing '%', or '-' when the denominator is not positive or the result is
# not a finite number. Rows are modified in place.
def add_percentages_by_vector(table, col, denominators)
  table.each_with_index do |row, row_index|
    next if row_index.zero? # Skip header

    divisor = denominators[row_index - 1]
    cell = '-'
    if divisor > 0
      ratio = row[col] * 100.0 / divisor
      cell = format('%.2f', ratio) + '%' if ratio.finite?
    end
    row << cell
  end
end
544
+
545
# Append a percentage cell to every data row, using a single denominator.
#
# table       - array of rows; the first row is the header and is skipped.
# col         - index of the cell holding the value of each row.
# denominator - value every row is divided by.
#
# The appended cell is the value as a percentage with two decimals and a
# trailing '%', or '-' when the result is not a finite number (for example
# with a zero denominator). Rows are modified in place.
def add_percentages_by_scalar(table, col, denominator)
  table.each_with_index do |row, row_index|
    next if row_index.zero? # Skip header

    ratio = row[col] * 100.0 / denominator
    cell = ratio.finite? ? format('%.2f', ratio) + '%' : '-'
    row << cell
  end
end
558
+
559
# Finish the statistics and write the text and HTML summaries.
#
# txt_file  - object responding to +puts+ (it is handed to write_txt).
# html_file - destination handed to Report_html#write.
#
# The remaining arguments are forwarded to last_stats and
# handle_data_main_summary. The template is loaded with File.read so that no
# file handle is left open.
def write_summary_stats(stats_hash, stats_taxonomy, stats_functional_annotation_by_seqs, diff_ids_array, diff_ids_complete_array, pre_fln_seq_lengths, seq_lengths, txt_file, html_file)
  stats_hash = last_stats(stats_hash, diff_ids_array, diff_ids_complete_array, pre_fln_seq_lengths, seq_lengths)
  write_txt(stats_hash, txt_file)
  container = handle_data_main_summary(stats_hash, stats_taxonomy, stats_functional_annotation_by_seqs)
  template = File.read(File.join(REPORT_FOLDER, 'general_summary.erb'))
  report = Report_html.new(container, 'FLN Summary')
  report.build(template)
  report.write(html_file)
end
568
+
569
# Write the mapping HTML report to fln_results/mapping_summary.html.
#
# fpkm              - collection forwarded to go_for_graph.
# coverage_analysis - hash whose values are arrays; indexes 0..3 are read.
#                     NOTE(review): the header rows below suggest index 1 is
#                     the mean of the 10 highest coverages, 2 the mean
#                     coverage and 3 the transcript coverage; confirm
#                     against the code that fills this hash.
#
# Nothing is written (and nil is returned) when either fpkm or
# coverage_analysis is empty. The template is loaded with File.read so that
# no file handle is left open.
def write_mapping_report(fpkm, coverage_analysis, stats_functional_annotation_by_seqs)
  return if fpkm.empty? || coverage_analysis.empty?

  container = go_for_graph(stats_functional_annotation_by_seqs, fpkm)

  measured_coverages = coverage_analysis.values.map { |c| [c[1], c[2]] }
  measured_coverages.sort! { |c1, c2| c2[1] <=> c1[1] }
  measured_coverages.each_with_index do |cov, i|
    cov.unshift(i + 1) # Puts x axis: 1, 2, 3 ... (seqs)
  end
  measured_coverages.unshift(%w[transcripts mean_10max mean])
  container[:mean_coverage] = measured_coverages

  sorted_by_max10 = coverage_analysis.values.sort { |c1, c2| c2[1] <=> c1[1] }
  container[:max10_coverage] = sorted_by_max10.each_with_index.map { |c, i| [i + 1, c[1]] }
  container[:normalized_partial_coverage] = coverage_analysis.values.map { |c| [c[3], c[0]] }

  mean_cov_trasn_cov = coverage_analysis.values.map { |data| [data[3], data[2]] }
  mean_cov_trasn_cov.sort! { |i1, i2| i1[0] <=> i2[0] }
  mean_cov_trasn_cov.unshift(%w[trans_cov mean_coverage])
  container[:normalized_coverages_sorted_by_npc] = mean_cov_trasn_cov

  template = File.read(File.join(REPORT_FOLDER, 'mapping_summary.erb'))
  report = Report_html.new(container, 'FLN Summary')
  report.build(template)
  report.write(File.join('fln_results', 'mapping_summary.html'))
end
592
+
593
# Write the reptrans statistics as a text file and as an HTML report.
#
# stats_hash - reptrans counters hash.
# html_file  - destination handed to Report_html#write.
# txt_file   - path of the text file to create.
#
# The text file is written inside a File.open block so that it is flushed
# and closed before returning; the template is loaded with File.read.
def write_reptrans_stats(stats_hash, html_file, txt_file)
  File.open(txt_file, 'w') { |txt| write_txt(stats_hash, txt) }
  container = handle_data_reptrans_summary(stats_hash)
  template = File.read(File.join(REPORT_FOLDER, 'reptrans_summary.erb'))
  report = Report_html.new(container, 'FLN Reptrans Summary')
  report.build(template)
  report.write(html_file)
end
602
+
603
# Dump the counters as tab-separated "value<TAB>key" lines.
#
# stats_hash - hash to dump, in its own iteration order.
# file       - any object responding to +puts+.
def write_txt(stats_hash, file)
  stats_hash.each_pair do |name, count|
    line = count.to_s + "\t" + name.to_s
    file.puts line
  end
end
608
+
609
# Wrap a title string in the bold <div> markup used as a table heading.
#
# title - String placed inside the <b> element (inserted as given).
#
# Returns the HTML snippet as a new String.
def table_title(title)
  opening_markup = '<div style="font-size:25px; margin: 10"><b>'
  closing_markup = '</b></div>'
  opening_markup + title + closing_markup
end
613
+ end