full_lengther_next 0.6.2 → 0.9.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +5 -0
  5. data/CODE_OF_CONDUCT.md +49 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +21 -0
  8. data/{README.rdoc → README.md} +0 -0
  9. data/Rakefile +6 -37
  10. data/bin/console +14 -0
  11. data/bin/download_fln_dbs.rb +2 -7
  12. data/bin/full_lengther_next +85 -6
  13. data/bin/make_user_db.rb +13 -5
  14. data/bin/setup +8 -0
  15. data/full_lengther_next.gemspec +42 -0
  16. data/lib/full_lengther_next.rb +2 -10
  17. data/lib/full_lengther_next/artifacts.rb +74 -0
  18. data/lib/full_lengther_next/{classes/blast_functions.rb → blast_functions.rb} +0 -0
  19. data/lib/full_lengther_next/{classes/cdhit.rb → cdhit.rb} +0 -0
  20. data/lib/full_lengther_next/{classes/chimeric_seqs.rb → chimeric_seqs.rb} +0 -0
  21. data/lib/full_lengther_next/{classes/common_functions.rb → common_functions.rb} +0 -0
  22. data/lib/full_lengther_next/{classes/exonerate_result.rb → exonerate_result.rb} +0 -0
  23. data/lib/full_lengther_next/{classes/fl_analysis.rb → fl_analysis.rb} +0 -0
  24. data/lib/full_lengther_next/{classes/fl_string_utils.rb → fl_string_utils.rb} +0 -0
  25. data/lib/full_lengther_next/fln_stats.rb +613 -0
  26. data/lib/full_lengther_next/go_methods.rb +42 -0
  27. data/lib/full_lengther_next/{classes/handle_db.rb → handle_db.rb} +0 -0
  28. data/lib/full_lengther_next/mapping.rb +296 -0
  29. data/lib/full_lengther_next/{classes/my_worker.rb → my_worker.rb} +71 -9
  30. data/lib/full_lengther_next/{classes/my_worker_EST.rb → my_worker_EST.rb} +0 -0
  31. data/lib/full_lengther_next/{classes/my_worker_manager_EST.rb → my_worker_manager_EST.rb} +0 -0
  32. data/lib/full_lengther_next/{classes/my_worker_manager_fln.rb → my_worker_manager_fln.rb} +181 -16
  33. data/lib/full_lengther_next/{classes/nc_rna.rb → nc_rna.rb} +0 -0
  34. data/lib/full_lengther_next/{classes/orf.rb → orf.rb} +0 -0
  35. data/lib/full_lengther_next/{classes/reptrans.rb → reptrans.rb} +9 -5
  36. data/lib/full_lengther_next/{classes/sequence.rb → sequence.rb} +26 -1
  37. data/lib/full_lengther_next/{classes/test_code.rb → test_code.rb} +1 -1
  38. data/lib/full_lengther_next/{classes/types.rb → types.rb} +3 -2
  39. data/lib/full_lengther_next/{classes/une_los_hit.rb → une_los_hit.rb} +0 -0
  40. data/lib/full_lengther_next/version.rb +3 -0
  41. data/lib/full_lengther_next/{classes/warnings.rb → warnings.rb} +0 -0
  42. data/report_templates/general_summary.erb +140 -0
  43. data/report_templates/mapping_summary.erb +98 -0
  44. data/report_templates/reptrans_summary.erb +32 -0
  45. metadata +112 -134
  46. data/.gemtest +0 -0
  47. data/History.txt +0 -32
  48. data/Manifest.txt +0 -44
  49. data/PostInstall.txt +0 -6
  50. data/bin/plot_fln.rb +0 -270
  51. data/bin/plot_taxonomy.rb +0 -70
  52. data/lib/expresscanvas.zip +0 -0
  53. data/lib/full_lengther_next/classes/artifacts.rb +0 -66
  54. data/lib/full_lengther_next/classes/fln_stats.rb +0 -641
  55. data/script/console +0 -10
  56. data/script/destroy +0 -14
  57. data/script/generate +0 -14
  58. data/test/test_full_lengther_next.rb +0 -11
  59. data/test/test_helper.rb +0 -3
@@ -1,13 +1,5 @@
1
- $:.unshift(File.dirname(__FILE__)) unless
2
- $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
-
4
- # ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')
5
-
6
- $: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', 'classes'))
7
-
1
+ require "full_lengther_next/version"
8
2
 
9
3
  module FullLengtherNext
10
- VERSION = '0.6.2'
11
-
12
- FULL_LENGHTER_VERSION = VERSION
4
+ # Your code goes here...
13
5
  end
@@ -0,0 +1,74 @@
1
+ require 'blast_functions'
2
+ require 'types'
3
+ require 'chimeric_seqs'
4
+ include ChimericSeqs
5
+
6
+ #####################################################################
7
+ ## MAIN FUNCTION
8
+ #####################################################################
9
# Decide whether +seq+ is an assembly artifact and flag it accordingly.
#
# The checks run in this order: unmapped contig (only when there is no
# +query+), misassembly, multiple HSPs (warning only, never an artifact)
# and chimera.
#
# seq      - sequence object; its hit, type, db_name, save_fasta and ignore
#            attributes may be updated in place.
# query    - BLAST query result for +seq+, or nil when there is none.
# db_name  - database name stored on the sequence when it is flagged.
# db_path  - database path, forwarded to search_chimeras.
# options  - options hash; the chimera search is skipped when
#            options[:chimera] includes 'd'.
# new_seqs - array that receives the sequences returned by search_chimeras.
#
# Returns true when the sequence was classified as an artifact, else false.
def artifact?(seq, query, db_name, db_path, options, new_seqs)
  # Literal booleans: the TRUE/FALSE constants no longer exist in Ruby >= 3.2.
  artifact = false

  # UNMAPPED CONTIG DETECTION
  if query.nil? && seq.unmapped?
    seq.hit = nil
    artifact = true
    seq.type = UNMAPPED
  end

  unless query.nil?
    # MISASSEMBLED DETECTION
    # A misassembled sequence stops the chimera analysis.
    if !artifact && misassembled_detection(query)
      seq.hit = query.hits.first
      artifact = true
      seq.type = MISASSEMBLED
      seq.warnings('ERROR#1')
    end

    # OVERLAPPING HSPS ON SUBJECT DETECTION (disabled in the original code)
    # if !artifact
    #   hit_reference = query.hits.first.dup
    #   query, overlapping = overlapping_hsps_on_subject(query)
    #   if overlapping
    #     if query.hits.first.nil?
    #       seq.hit = hit_reference
    #     else
    #       seq.hit = query.hits.first
    #     end
    #     artifact = true
    #     seq.type = OTHER
    #     seq.warnings('ERROR#2')
    #   end
    # end

    # MULTIPLE HSP DETECTION
    # Only records a warning; the sequence is not marked as an artifact.
    if !artifact && multiple_hsps(query, 3)
      seq.hit = query.hits.first
      seq.warnings('ERROR#3')
    end

    # CHIMERA DETECTION
    if !artifact && !options[:chimera].include?('d')
      chimera = search_chimeras(seq, query, options, db_name, db_path)
      unless chimera.nil?
        new_seqs.concat(chimera)
        seq.db_name = db_name
        seq.type = CHIMERA
        artifact = true
      end
    end
  end

  if artifact
    puts seq.prot_annot_calification if $verbose > 1
    seq.db_name = db_name
    seq.save_fasta = false
    seq.ignore = true
  end
  artifact
end
72
+
73
+
74
+
@@ -0,0 +1,613 @@
1
+ require 'report_html'
2
+ require 'types.rb'
3
+ require 'go_methods'
4
+
5
+ module FlnStats
6
+ REPORT_FOLDER = File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'report_templates'))
7
# Build a fresh statistics hash with every counter set to 0.
#
# Keys are inserted in the order listed here, which is also the order in
# which write_txt prints them.
#
# Returns a new Hash of counter name (String) => 0.
def initialize_stats_hash
  counter_names = %w[
    input_seqs
    output_seqs
    failed
    full_transcriptome_length
    PRE_FLN_full_transcriptome_length
    mean_length
    PRE_FLN_mean_length
    indeterminations
    PRE_FLN_indeterminations
    gap_number
    PRE_FLN_gap_number
    indetermination_mean_length
    PRE_FLN_indetermination_mean_length
    sequences_>200
    sequences_>500
    PRE_FLN_sequences_>500
    longest_unigene
    n50
    PRE_FLN_n50
    n90
    PRE_FLN_n90
    good_seqs
    artifacts
    misassembled
    chimeras
    unmapped
    other_artifacts
    unknown
    unknown_>200
    unknown_>500
    prot_annotated
    complete
    complete_sure
    complete_putative
    n_terminal
    n_terminal_sure
    n_terminal_putative
    c_terminal
    c_terminal_sure
    c_terminal_putative
    internal
    swissprot
    trembl
    userdb
    ncrna
    coding
    coding_sure
    coding_putative
    coding_>200
    coding_>500
    different_orthologues
    different_completes
    BA_index
  ]
  counter_names.each_with_object({}) { |counter, stats| stats[counter] = 0 }
end
66
+
67
# Extract the organism named in a hit description and count it in +taxonomy+.
#
# name     - hit description String (never modified). Three layouts are
#            handled:
#            * contains 'OS='            : text between 'OS=' and ' GN='.
#            * starts with 'sp=' or 'tr=': first "word word" pair that is
#              followed by two (preferred) or one parenthesised name.
#            * anything else             : text after the first ';' up to
#              the first '.'.
# taxonomy - Hash of organism name => number of hits, updated in place.
#
# Only the first two words of the organism are used as the key.
# Returns the updated count for the organism, or nil when none was found.
def get_taxonomy(name, taxonomy)
  organism = nil
  if name.include?('OS=')
    organism = name.split('OS=', 2).last.split(' GN=').first.strip
  elsif name.start_with?('sp=', 'tr=')
    # This must be a comparison. The previous `name[0..2] = 'sp=' || ...`
    # was an assignment: it rewrote the caller's string and was always
    # truthy, so the generic branch below could never run.
    organism = name[/(\w+ \w+) \(([\w ]+)\) \(([\w ]+)\)/, 1] ||
               name[/(\w+ \w+) \(([\w ]+)\)/, 1]
  else
    organism = name.split(';', 2).last
    organism = organism.split('.', 2).first
    organism.gsub!(/\(\D+\)/, '')
    if organism.split(' ').length > 1
      organism.gsub!('.', '')
      organism.gsub!(/^ /, '')
      # NOTE(review): as written this strips every space, so multi-word names
      # collapse into one token - confirm whether squeezing double spaces was
      # what was intended.
      organism.gsub!(' ', '')
      organism.strip!
    end
  end
  return nil if organism.nil?

  organism = organism.split(' ')[0..1].join(' ')
  taxonomy[organism] = (taxonomy[organism] || 0) + 1
end
102
+
103
# Build a fresh RepTrans statistics hash with every counter set to 0.
#
# Returns a new Hash of counter name (String) => 0, in the order listed.
def initialize_stats_hash_reptrans
  counter_names = [
    'prot_annotated',
    'est_annotated',
    'coding_>1',
    'coding_>0.94',
    'coding_>0.84',
    'coding_>0.73',
    'coding_>0'
  ]
  Hash[counter_names.map { |counter| [counter, 0] }]
end
115
+
116
+ # Extract sequence stats
117
+ ##################################################
118
# Accumulate the pre-FLN (input) statistics of one sequence.
#
# seq        - object whose seq_fasta returns the nucleotide String.
# stats_hash - statistics Hash; the 'input_seqs' and 'PRE_FLN_*' counters
#              are updated in place.
#
# Returns the updated 'PRE_FLN_gap_number' counter.
def sequence_stats(seq, stats_hash)
  nt_seq = seq.seq_fasta
  stats_hash['input_seqs'] += 1
  # Strictly greater than 500, matching the key name and the '> 500' test
  # that summary_stats applies to the post-FLN 'sequences_>500' counter.
  stats_hash['PRE_FLN_sequences_>500'] += 1 if nt_seq.length > 500
  stats_hash['PRE_FLN_full_transcriptome_length'] += nt_seq.length
  # Indeterminations are N bases in either case; a gap is a run of them.
  stats_hash['PRE_FLN_indeterminations'] += nt_seq.count('nN')
  stats_hash['PRE_FLN_gap_number'] += nt_seq.scan(/[nN]+/).length
end
126
+
127
+ # Build final stats
128
+ ####################################################
129
# Accumulate the post-FLN statistics of a batch of analysed sequences.
#
# Sequences are classified by comparing seq.type with the type constants
# (FAILED, UNKNOWN, COMPLETE, INTERNAL, NCRNA, CODING, ...), so the result
# depends on the numeric order of those constants.
#
# seqs                    - Array of sequence objects (may be empty).
# stats_hash              - statistics Hash, updated in place.
# diff_ids_array          - Array collecting the unique hit accessions of
#                           sequences with UNKNOWN < type < NCRNA.
# diff_ids_complete_array - Array collecting the unique hit accessions of
#                           COMPLETE sequences.
# all_seq_lengths         - Array receiving the length of every sequence
#                           with type >= UNKNOWN.
#
# Returns [stats_hash, diff_ids_array, diff_ids_complete_array, all_seq_lengths].
def summary_stats(seqs, stats_hash, diff_ids_array, diff_ids_complete_array, all_seq_lengths)
  low_limit = 200
  upper_limit = 500

  # All seqs
  #-----------
  stats_hash['output_seqs'] += seqs.length
  good_seqs = seqs.select { |s| s.type >= UNKNOWN }
  stats_hash['good_seqs'] += good_seqs.length

  # Indeterminations (N bases) and gaps (runs of N bases)
  good_seqs.each do |s|
    stats_hash['indeterminations'] += s.seq_fasta.count('nN')
    stats_hash['gap_number'] += s.seq_fasta.scan(/[nN]+/).length
  end

  # Longest unigene. max is nil for an empty batch, which must leave the
  # stored value alone instead of raising.
  longest = seqs.map { |s| s.fasta_length }.max
  stats_hash['longest_unigene'] = longest if longest && longest > stats_hash['longest_unigene']

  # Load ids
  seqs.each do |s|
    diff_ids_array << s.hit.acc if s.type > UNKNOWN && s.type < NCRNA
  end
  diff_ids_array.uniq!

  # By length
  seq_lengths = good_seqs.map { |s| s.fasta_length }
  all_seq_lengths.concat(seq_lengths)
  stats_hash['full_transcriptome_length'] += seq_lengths.inject(0) { |sum, n| sum + n }
  stats_hash['sequences_>200'] += seq_lengths.count { |l| l > low_limit }
  stats_hash['sequences_>500'] += seq_lengths.count { |l| l > upper_limit }

  stats_hash['failed'] += seqs.count { |s| s.type == FAILED }

  # Unknown
  #-----------------------------
  all_unknown = seqs.select { |s| s.type == UNKNOWN }
  stats_hash['unknown'] += all_unknown.length
  stats_hash['unknown_>200'] += all_unknown.count { |s| s.fasta_length > low_limit }
  stats_hash['unknown_>500'] += all_unknown.count { |s| s.fasta_length > upper_limit }

  # Artifacts
  #----------------
  stats_hash['artifacts'] += seqs.count { |s| s.type < UNKNOWN && s.type > FAILED }
  stats_hash['misassembled'] += seqs.count { |s| s.type == MISASSEMBLED }
  stats_hash['unmapped'] += seqs.count { |s| s.type == UNMAPPED }
  # Pieces of a split chimera ('_split_' in the name) are not counted again.
  stats_hash['chimeras'] += seqs.count { |s| s.type == CHIMERA && !s.seq_name.include?('_split_') }
  stats_hash['other_artifacts'] += seqs.count { |s| s.type == OTHER }

  # Annotated with prot
  #---------------------
  prot_annotated = seqs.select { |s| s.type >= COMPLETE && s.type <= INTERNAL }
  stats_hash['prot_annotated'] += prot_annotated.length

  # By annotation
  stats_hash['internal'] += seqs.count { |s| s.type == INTERNAL }
  complete = seqs.select { |s| s.type == COMPLETE }
  n_terminal = seqs.select { |s| s.type == N_TERMINAL }
  c_terminal = seqs.select { |s| s.type == C_TERMINAL }

  stats_hash['complete'] += complete.length
  stats_hash['n_terminal'] += n_terminal.length
  stats_hash['c_terminal'] += c_terminal.length

  # Load complete ids
  complete.each { |s| diff_ids_complete_array << s.hit.acc }
  diff_ids_complete_array.uniq!

  # By status: a truthy status counts as "sure", anything else as "putative"
  {
    'complete' => complete,
    'n_terminal' => n_terminal,
    'c_terminal' => c_terminal
  }.each do |label, group|
    sure = group.count { |s| s.status }
    stats_hash["#{label}_sure"] += sure
    stats_hash["#{label}_putative"] += group.length - sure
  end

  # By database: whatever is neither SwissProt nor TrEMBL is the user db
  swissprot = prot_annotated.count { |s| s.db_name =~ /^sp_/ }
  trembl = prot_annotated.count { |s| s.db_name =~ /^tr_/ }
  stats_hash['swissprot'] += swissprot
  stats_hash['trembl'] += trembl
  stats_hash['userdb'] += prot_annotated.length - swissprot - trembl

  # ncRNA
  #----------------
  stats_hash['ncrna'] += seqs.count { |s| s.type == NCRNA }

  # Coding sequences
  #----------------
  coding = seqs.select { |s| s.type == CODING }
  stats_hash['coding'] += coding.length
  coding_sure = coding.count { |s| s.status }
  stats_hash['coding_sure'] += coding_sure
  stats_hash['coding_putative'] += coding.length - coding_sure
  stats_hash['coding_>200'] += coding.count { |s| s.fasta_length > low_limit }
  stats_hash['coding_>500'] += coding.count { |s| s.fasta_length > upper_limit }

  return stats_hash, diff_ids_array, diff_ids_complete_array, all_seq_lengths
end
239
+
240
# Compute N50 and N90 from a list of sequence lengths and store them.
#
# Lengths are walked from longest to shortest while accumulating their sum.
# N50 is the first length at which the cumulative fraction exceeds 0.5 and
# N90 the first at which it exceeds 0.9.
#
# stats_hash  - statistics Hash, updated in place.
# f_tot_key   - key of the total length used as the denominator.
# n50_key     - key that receives N50; only written while it still holds 0.
# n90_key     - key that receives N90.
# seq_lengths - Array of lengths; sorted in place in descending order.
def calculate_n50_n90(stats_hash, f_tot_key, n50_key, n90_key, seq_lengths)
  total_length = stats_hash[f_tot_key].to_f
  cumulative = 0
  seq_lengths.sort! { |a, b| b <=> a }
  seq_lengths.each do |length|
    cumulative += length
    fraction = cumulative / total_length
    stats_hash[n50_key] = length if fraction > 0.5 && stats_hash[n50_key] == 0
    # Tested independently of N50: one length can cross both thresholds
    # (e.g. a single dominant sequence) and must then set both values.
    if fraction > 0.9
      stats_hash[n90_key] = length
      break
    end
  end
end
254
+
255
# Derive the final statistics once every sequence has been accumulated:
# distinct id counts, mean lengths, N50/N90 (pre- and post-FLN) and the
# BA index.
#
# stats_hash              - statistics Hash, updated in place.
# diff_ids_array          - Array of distinct orthologue ids.
# diff_ids_complete_array - Array of distinct ids of complete sequences.
# pre_fln_seq_lengths     - lengths used for the PRE_FLN N50/N90.
# seq_lengths             - lengths used for the post-FLN N50/N90.
#
# Returns stats_hash.
def last_stats(stats_hash, diff_ids_array, diff_ids_complete_array, pre_fln_seq_lengths, seq_lengths)
  stats_hash['different_orthologues'] = diff_ids_array.length
  stats_hash['different_completes'] = diff_ids_complete_array.length

  # [target, numerator, denominator]; a mean is left untouched when its
  # denominator is not positive.
  [
    ['mean_length', 'full_transcriptome_length', 'good_seqs'],
    ['indetermination_mean_length', 'indeterminations', 'gap_number'],
    ['PRE_FLN_mean_length', 'PRE_FLN_full_transcriptome_length', 'input_seqs'],
    ['PRE_FLN_indetermination_mean_length', 'PRE_FLN_indeterminations', 'PRE_FLN_gap_number']
  ].each do |target, numerator, denominator|
    next unless stats_hash[denominator] > 0
    stats_hash[target] = stats_hash[numerator].to_f / stats_hash[denominator]
  end

  calculate_n50_n90(stats_hash, 'full_transcriptome_length', 'n50', 'n90', seq_lengths)
  calculate_n50_n90(stats_hash, 'PRE_FLN_full_transcriptome_length', 'PRE_FLN_n50', 'PRE_FLN_n90', pre_fln_seq_lengths)

  # BA index: only computed when every term involved is positive
  annotated = stats_hash['prot_annotated']
  complete = stats_hash['complete']
  long_seqs = stats_hash['sequences_>500']
  orthologues = stats_hash['different_orthologues']
  distinct_completes = stats_hash['different_completes']
  if [annotated, complete, long_seqs, orthologues, distinct_completes].all? { |count| count > 0 }
    annotation_coef = (annotated * complete * 1.0) / (long_seqs * 10000)
    improvement_coef = (orthologues * 1.0 + distinct_completes) / (annotated + complete)
    stats_hash['BA_index'] = Math.sqrt(annotation_coef * improvement_coef)
  end

  stats_hash
end
279
+
280
# Count a coding sequence in the RepTrans bucket matching its test-code score.
#
# coding_seq - object whose t_code returns the numeric test-code score.
# stats_hash - RepTrans statistics Hash, updated in place.
#
# The first bucket whose lower bound is strictly below the score wins;
# scores of 0 or less are not counted.
# Returns the updated bucket count, or nil when nothing was counted.
def coding_stats_reptrans(coding_seq, stats_hash)
  buckets = [
    [1,    'coding_>1'],
    [0.95, 'coding_>0.94'],
    [0.85, 'coding_>0.84'],
    [0.73, 'coding_>0.73'],
    [0,    'coding_>0']
  ]
  _bound, group = buckets.find { |bound, _label| coding_seq.t_code > bound }
  stats_hash[group] += 1 unless group.nil?
end
297
+
298
# Build the data container used to render the general FLN summary report.
#
# stats_hash                          - final statistics Hash.
# stats_taxonomy                      - Hash of organism => number of
#                                       annotations.
# stats_functional_annotation_by_seqs - functional annotation data passed to
#                                       go_for_graph.
#
# Returns a Hash holding whatever go_for_graph returns plus the
# :general_report, :assembly_report, :structural_data, :status_report,
# :taxonomy and :database_report tables. Each table is an Array of rows
# whose first row is the header.
def handle_data_main_summary(stats_hash, stats_taxonomy, stats_functional_annotation_by_seqs)
  container = {}
  indent = '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'
  indent2 = indent * 2

  input_seqs = stats_hash['input_seqs']
  output_seqs = stats_hash['output_seqs']
  artifacts = stats_hash['artifacts']
  good_seqs = stats_hash['good_seqs']
  prot_annotated = stats_hash['prot_annotated']

  # GENERAL REPORT TABLE
  #-------------------------------------------------------
  # Each row is [label, value, denominator]. A denominator of 0 means the
  # row gets no percentage ('-').
  general_rows = [
    ['Input', input_seqs, input_seqs],
    [indent + 'N50 (bp)', stats_hash['PRE_FLN_n50'], 0],
    [indent + 'N90 (bp)', stats_hash['PRE_FLN_n90'], 0],
    [indent + 'Full transcriptome length (bp)', stats_hash['PRE_FLN_full_transcriptome_length'], 0],
    [indent + 'Mean sequence length (bp)', '%.2f' % stats_hash['PRE_FLN_mean_length'], 0],
    [indent + 'Nucleotide indeterminations (bp)', stats_hash['PRE_FLN_indeterminations'], stats_hash['PRE_FLN_full_transcriptome_length']],
    [indent + 'Mean indetermination length (bp)', '%.2f' % stats_hash['PRE_FLN_indetermination_mean_length'], 0],
    [indent + 'Unigenes >500pb', stats_hash['PRE_FLN_sequences_>500'], input_seqs],
    [indent + 'Failing sequences', stats_hash['failed'], output_seqs],
    [indent + 'Artifacts <sup>1</sup>', artifacts, output_seqs],
    [indent2 + 'Unmapped transcripts', stats_hash['unmapped'], artifacts],
    [indent2 + 'Misassembled', stats_hash['misassembled'], artifacts],
    [indent2 + 'Chimeras', stats_hash['chimeras'], artifacts],
    [indent2 + 'Other', stats_hash['other_artifacts'], artifacts],
    ['Sequences with resolved chimeras', output_seqs, input_seqs],
    ['Sequences without artifacts', good_seqs, output_seqs],
    [indent + 'N50 (bp)', stats_hash['n50'], 0],
    [indent + 'N90 (bp)', stats_hash['n90'], 0],
    [indent + 'Full transcriptome length (bp)', stats_hash['full_transcriptome_length'], 0],
    [indent + 'Mean sequence length (bp)', '%.2f' % stats_hash['mean_length'], 0],
    [indent + 'Nucleotide indeterminations (bp)', stats_hash['indeterminations'], stats_hash['full_transcriptome_length']],
    [indent + 'Mean indetermination length (bp)', '%.2f' % stats_hash['indetermination_mean_length'], 0]
  ]
  general_report = [['', 'Sequences', '%']]
  general_report.concat(general_rows.map { |label, value, _den| [label, value] })
  add_percentages_by_vector(general_report, 1, general_rows.map { |row| row.last })
  general_report << ['BA index', "%5.2f" % [stats_hash['BA_index']], '-'] if stats_hash['BA_index'] > 0

  # ASSEMBLY REPORT TABLE
  #-------------------------------------------------------
  without_orthologue = stats_hash['coding'] + stats_hash['unknown']
  assembly_rows = [
    ['Unigenes', good_seqs, good_seqs],
    ['Unigenes >500pb', stats_hash['sequences_>500'], good_seqs],
    ['Unigenes >200pb', stats_hash['sequences_>200'], good_seqs],
    ['Longest unigene', stats_hash['longest_unigene'], 0],
    ['With orthologue <sup>1</sup>', prot_annotated, good_seqs],
    [indent + 'Different orthologue IDs', stats_hash['different_orthologues'], prot_annotated],
    [indent + 'Complete transcripts', stats_hash['complete'], prot_annotated],
    [indent + 'Different complete transcripts', stats_hash['different_completes'], prot_annotated],
    ['ncRNA', stats_hash['ncrna'], good_seqs],
    ['Without orthologue <sup>1</sup>', without_orthologue, good_seqs],
    [indent + 'Coding (all)', stats_hash['coding'], without_orthologue],
    [indent + 'Coding > 200bp', stats_hash['coding_>200'], without_orthologue],
    [indent + 'Coding > 500bp', stats_hash['coding_>500'], without_orthologue],
    [indent + 'Unknown (all)', stats_hash['unknown'], without_orthologue],
    [indent + 'Unknown > 200bp', stats_hash['unknown_>200'], without_orthologue],
    [indent + 'Unknown > 500bp', stats_hash['unknown_>500'], without_orthologue]
  ]
  assembly_report = [['', 'Unigenes', '%']]
  assembly_report.concat(assembly_rows.map { |label, value, _den| [label, value] })
  add_percentages_by_vector(assembly_report, 1, assembly_rows.map { |row| row.last })

  # STRUCTURAL PROFILE
  #-------------------------------------------------------
  # Positive counts become percentages of the good sequences; zeros stay 0.
  as_percentage = lambda { |count| count > 0 ? count * 100.0 / good_seqs : count }
  structural_counts = [
    ['Unknown', stats_hash['unknown'], 0],
    ['Complete', stats_hash['complete_sure'], stats_hash['complete_putative']],
    ['N-terminal', stats_hash['n_terminal_sure'], stats_hash['n_terminal_putative']],
    ['C-terminal', stats_hash['c_terminal_sure'], stats_hash['c_terminal_putative']],
    ['Internal', stats_hash['internal'], 0],
    ['ncrna', stats_hash['ncrna'], 0],
    # 'Sure' column uses coding_sure: the total 'coding' count already
    # contains the putative ones and would count them twice.
    ['Coding', stats_hash['coding_sure'], stats_hash['coding_putative']]
  ]
  structural_data = [['Category', 'Sure', 'Putative']]
  structural_counts.each do |label, sure, putative|
    structural_data << [label, as_percentage.call(sure), as_percentage.call(putative)]
  end

  # STATUS REPORT
  #----------------------------------------------------------
  # 'colspan' / 'rowspan' cells are placeholders kept as-is for the template.
  status_report = [
    ['Status', 'colspan', 'Unigenes', '%'],
    ['Complete', 'Sure', stats_hash['complete_sure']],
    ['rowspan', 'Putative', stats_hash['complete_putative']],
    ['C-terminus', 'Sure', stats_hash['c_terminal_sure']],
    ['rowspan', 'Putative', stats_hash['c_terminal_putative']],
    ['N-terminus', 'Sure', stats_hash['n_terminal_sure']],
    ['rowspan', 'Putative', stats_hash['n_terminal_putative']],
    ['Internal', 'colspan', stats_hash['internal']],
    ['Coding', 'Sure', stats_hash['coding_sure']],
    ['rowspan', 'Putative', stats_hash['coding_putative']],
    ['ncRNA', 'colspan', stats_hash['ncrna']],
    ['Unknown', 'colspan', stats_hash['unknown']],
    ['Total', 'colspan', good_seqs]
  ]
  add_percentages_by_scalar(status_report, 2, good_seqs)

  # TAXONOMY PROFILE
  #-------------------------------------------------------
  # Organisms sorted by descending number of annotations, first 21 kept.
  taxonomy = [['Organism', 'Annotations']]
  taxonomy.concat(stats_taxonomy.to_a.sort { |s2, s1| s1.last <=> s2.last }[0..20])

  # DATABASE REPORT
  #-------------------------------------------------------
  database_report = [
    ['', 'Unigenes', '%'],
    ['UserDB', stats_hash['userdb']],
    ['SwissProt', stats_hash['swissprot']],
    ['TrEMBL', stats_hash['trembl']],
    ['ncRNA', stats_hash['ncrna']],
    ['None', without_orthologue],
    ['Total', good_seqs]
  ]
  add_percentages_by_scalar(database_report, 1, good_seqs)

  # GO ANNOTATION
  #-------------------------------------------------------
  container.merge!(go_for_graph(stats_functional_annotation_by_seqs))

  # BUILD CONTAINER
  #-------------------------------------------------------
  container[:general_report] = general_report
  container[:assembly_report] = assembly_report
  container[:structural_data] = structural_data
  container[:status_report] = status_report
  container[:taxonomy] = taxonomy
  container[:database_report] = database_report
  container
end
468
+
469
+
470
# Build the data container used to render the RepTrans summary report.
#
# stats_hash - RepTrans statistics Hash (category => number of sequences).
#
# Returns a Hash with two tables (header row first):
#   :general_report     - sequences per category plus the overall total.
#   :acumulative_report - running total of sequences, category by category.
def handle_data_reptrans_summary(stats_hash)
  categories = [
    ['Annotated with protein', 'prot_annotated'],
    ['Annotated with EST', 'est_annotated'],
    ['Coding test-code > 1', 'coding_>1'],
    ['Coding test-code > 0.94', 'coding_>0.94'],
    ['Coding test-code > 0.84', 'coding_>0.84'],
    ['Coding test-code > 0.73', 'coding_>0.73'],
    ['Coding test-code > 0', 'coding_>0']
  ]

  # GENERAL REPORT
  #-------------------------------------------------------
  all_seqs = stats_hash.values.inject(0) { |sum, count| sum + count }
  general_report = [
    ['', 'Sequences', '%'],
    ['Output', all_seqs]
  ]
  categories.each { |label, key| general_report << [label, stats_hash[key]] }
  add_percentages_by_scalar(general_report, 1, all_seqs)

  # ACUMULATIVE REPORT
  #-------------------------------------------------------
  # One running total per category, including leading zeros: dropping them
  # shifted every value to the wrong category and left the last rows empty.
  running_total = 0
  acumulative_report = [['', 'Sequences', '%']]
  categories.each do |label, key|
    running_total += stats_hash[key]
    acumulative_report << [label, running_total]
  end
  add_percentages_by_scalar(acumulative_report, 1, all_seqs)

  # BUILD CONTAINER
  #-------------------------------------------------------
  container = {}
  container[:general_report] = general_report
  container[:acumulative_report] = acumulative_report
  container
end
529
+
530
# Append a percentage cell to every data row of +table+, each row having its
# own denominator.
#
# table        - Array of rows; the first row is the header and is skipped.
# col          - index of the cell holding the row value.
# denominators - one denominator per data row, in row order.
#
# The appended cell is 'NN.NN%' when the denominator is positive and the
# result is a finite number, '-' otherwise.
# Returns table.
def add_percentages_by_vector(table, col, denominators)
  table.drop(1).zip(denominators).each do |row, den|
    ratio = den > 0 ? row[col] * 100.0 / den : nil
    cell = '-'
    cell = format('%.2f', ratio) + '%' if ratio && ratio.finite?
    row << cell
  end
  table
end
544
+
545
# Append a percentage cell to every data row of +table+, using the same
# denominator for all rows.
#
# table       - Array of rows; the first row is the header and is skipped.
# col         - index of the cell holding the row value.
# denominator - common denominator.
#
# The appended cell is 'NN.NN%' when the result is a finite number and '-'
# otherwise (e.g. a zero denominator).
# Returns table.
def add_percentages_by_scalar(table, col, denominator)
  table.drop(1).each do |row|
    ratio = row[col] * 100.0 / denominator
    row << (ratio.finite? ? format('%.2f', ratio) + '%' : '-')
  end
  table
end
558
+
559
# Finish the statistics and write the plain-text and HTML summary reports.
#
# stats_hash, diff_ids_array, diff_ids_complete_array, pre_fln_seq_lengths
# and seq_lengths are forwarded to last_stats; stats_taxonomy and
# stats_functional_annotation_by_seqs to handle_data_main_summary.
#
# txt_file  - object passed to write_txt, which calls puts on it (an open
#             IO, not a path).
# html_file - destination handed to Report_html#write.
def write_summary_stats(stats_hash, stats_taxonomy, stats_functional_annotation_by_seqs, diff_ids_array, diff_ids_complete_array, pre_fln_seq_lengths, seq_lengths, txt_file, html_file)
  stats_hash = last_stats(stats_hash, diff_ids_array, diff_ids_complete_array, pre_fln_seq_lengths, seq_lengths)
  write_txt(stats_hash, txt_file)
  container = handle_data_main_summary(stats_hash, stats_taxonomy, stats_functional_annotation_by_seqs)
  # File.read closes the template; File.open(...).read left the handle open.
  template = File.read(File.join(REPORT_FOLDER, 'general_summary.erb'))
  report = Report_html.new(container, 'FLN Summary')
  report.build(template)
  report.write(html_file)
end
568
+
569
# Write the HTML mapping report to fln_results/mapping_summary.html.
# Nothing is done when +fpkm+ or +coverage_analysis+ is empty.
#
# fpkm                                - passed to go_for_graph.
# coverage_analysis                   - Hash whose values are arrays; the
#                                       columns below are built from their
#                                       elements 0 to 3.
# stats_functional_annotation_by_seqs - passed to go_for_graph.
def write_mapping_report(fpkm, coverage_analysis, stats_functional_annotation_by_seqs)
  return if fpkm.empty? || coverage_analysis.empty?

  container = go_for_graph(stats_functional_annotation_by_seqs, fpkm)
  coverages = coverage_analysis.values

  # [rank, element 1, element 2], ranked by descending element 2
  measured_coverages = coverages.map { |c| [c[1], c[2]] }
  measured_coverages.sort! { |c1, c2| c2[1] <=> c1[1] }
  measured_coverages.each_with_index do |cov, i|
    cov.unshift(i + 1) # x axis: 1, 2, 3 ... (seqs)
  end
  measured_coverages.unshift(%w[transcripts mean_10max mean])
  container[:mean_coverage] = measured_coverages

  # [rank, element 1], ranked by descending element 1
  by_max10 = coverages.sort { |c1, c2| c2[1] <=> c1[1] }
  container[:max10_coverage] = by_max10.each_with_index.map { |c, i| [i + 1, c[1]] }

  container[:normalized_partial_coverage] = coverages.map { |c| [c[3], c[0]] }

  # [element 3, element 2], ascending by element 3
  mean_cov_trasn_cov = coverages.map { |data| [data[3], data[2]] }
  mean_cov_trasn_cov.sort! { |i1, i2| i1[0] <=> i2[0] }
  mean_cov_trasn_cov.unshift(%w[trans_cov mean_coverage])
  container[:normalized_coverages_sorted_by_npc] = mean_cov_trasn_cov

  # File.read closes the template; File.open(...).read left the handle open.
  template = File.read(File.join(REPORT_FOLDER, 'mapping_summary.erb'))
  report = Report_html.new(container, 'FLN Summary')
  report.build(template)
  report.write(File.join('fln_results', 'mapping_summary.html'))
end
592
+
593
# Write the RepTrans statistics as a plain-text file and as an HTML report.
#
# stats_hash - RepTrans statistics Hash.
# html_file  - destination handed to Report_html#write.
# txt_file   - path of the plain-text file to create or overwrite.
def write_reptrans_stats(stats_hash, html_file, txt_file)
  # Block form closes the file, so the buffered text is flushed to disk
  # before this method returns.
  File.open(txt_file, 'w') { |txt| write_txt(stats_hash, txt) }
  container = handle_data_reptrans_summary(stats_hash)
  # File.read closes the template; File.open(...).read left the handle open.
  template = File.read(File.join(REPORT_FOLDER, 'reptrans_summary.erb'))
  report = Report_html.new(container, 'FLN Reptrans Summary')
  report.build(template)
  report.write(html_file)
end
602
+
603
# Dump the statistics as one "value<TAB>name" line per entry, in hash order.
#
# stats_hash - Hash of statistic name => value.
# file       - writable object responding to puts.
#
# Returns stats_hash.
def write_txt(stats_hash, file)
  stats_hash.each_pair { |name, amount| file.puts "#{amount}\t#{name}" }
end
608
+
609
# Wrap +title+ (a String) in the bold heading <div> used above report tables.
#
# Returns the HTML String.
def table_title(title)
  opening = '<div style="font-size:25px; margin: 10"><b>'
  closing = '</b></div>'
  opening + title + closing
end
613
+ end