seqtrimnext_report 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,323 @@
1
# Writes the statistics section (stats.tex) of the LaTeX final report.
#
# Creating an instance is the whole workflow: StatsReport.new(...) opens
# <output_latex>/stats.tex, writes the summary table, the optional MID
# notice, the "most frequent" tables and the per-plugin nucleotide table,
# and closes the file again.
#
# The stats arguments are nested hashes indexed with string keys (the code
# reads e.g. stats['sequences']['count']['input_count']).
class StatsReport

  # Characters that get a leading backslash before an identifier is written
  # into a LaTeX table cell.
  LATEX_SPECIAL_CHARS = /[_&%#$]/

  # Builds <output_latex>/stats.tex.
  #
  # all_params      - not used by this class; kept for interface compatibility.
  # initial_stats   - hash with the input statistics ('mode_of_sizes',
  #                   'mean_of_sequence_sizes', 'smallest_sequence_size',
  #                   'biggest_sequence_size', 'nucleotide_count').
  # stats           - hash with the per-plugin output statistics.
  # plugin_nts_hash - per-plugin settings for the nucleotide table, see
  #                   print_trimmed_nts_stats_table.
  # output_folder   - not used by this class; kept for interface compatibility.
  # output_latex    - directory in which stats.tex is created.
  def initialize(all_params, initial_stats, stats, plugin_nts_hash, output_folder, output_latex)
    # Block form so the file is closed even when a statistic is missing and
    # one of the writers raises.
    File.open(File.join(output_latex, 'stats.tex'), 'w') do |output2|
      output2.puts "%!TEX root = FinalReport.tex"

      write_graph_inputs(output2)
      write_summary_table(output2, initial_stats, stats)
      write_mid_summary(output2, stats)
      write_top_five_tables(output2, stats)

      nts_total = initial_stats['nucleotide_count']
      print_trimmed_nts_stats_table(stats, output2, plugin_nts_hash, nts_total, paired_insert_nts(stats))
    end

    puts "Statistic information was added to the report"
  end

  # Returns [input_mode, output_mode].
  #
  # input_mode is taken verbatim from initial_stats['mode_of_sizes'].
  # output_mode is ScbiStats#fat_mode over an array whose index is the insert
  # size and whose value is the number of inserts of that size. When there is
  # no insert size data the output mode is 0 and ScbiStats is not called.
  def get_mode(initial_stats, stats)
    input_mode = initial_stats['mode_of_sizes']

    sizes = insert_size_histogram(stats)
    return [input_mode, 0] if sizes.empty?

    mode_array = []
    sizes.each { |size, count| mode_array[size.to_i] = count }
    # sizes that never occurred are holes (nil) in the array
    mode_array.map! { |count| count || 0 }

    [input_mode, ScbiStats.new(mode_array).fat_mode]
  end

  # Returns [input_mean, output_mean], both formatted with one decimal.
  #
  # input_mean comes from initial_stats['mean_of_sequence_sizes']; output_mean
  # is the count-weighted mean of the insert sizes, or "0.0" when there are
  # no inserts (avoids dividing by zero).
  def get_mean(initial_stats, stats)
    input_mean = sprintf("%0.1f", (initial_stats['mean_of_sequence_sizes']))

    seqs_count = 0
    nts_count = 0.0
    insert_size_histogram(stats).each do |size, count|
      seqs_count += count.to_i
      nts_count += size.to_f * count.to_i
    end

    mean = seqs_count.zero? ? 0.0 : nts_count / seqs_count
    [input_mean, sprintf("%0.1f", mean)]
  end

  # Writes a LaTeX table with the most frequent entries of top_hash.
  #
  # output2  - IO-like object that responds to puts.
  # top_hash - identifier => number of sequences.
  # name     - heading used in the caption and the first column.
  # limit    - maximum number of rows (defaults to the historical 5).
  def make_a_top_five(output2, top_hash, name, limit = 5)
    output2.puts '\begin{table}[H]'
    output2.puts '\caption{'+"List of the most frequent~#{name}~found among your reads"+'}'
    output2.puts '\vspace{-0.5cm}'
    output2.puts '\begin{center}'
    output2.puts '\begin{tabular}{|p{11cm}|r|}'
    output2.puts '\hline'
    output2.puts "#{name} " +'& sequences \\\\ [0.5ex]'
    output2.puts '\hline'

    top_hash.sort { |a, b| b[1] <=> a[1] }.first(limit).each do |id, count|
      # an unescaped special character in the id would break the table row
      escaped_id = id.to_s.gsub(LATEX_SPECIAL_CHARS) { |char| "\\#{char}" }
      output2.puts "#{escaped_id} & #{count}"+'\\\\'
    end

    output2.puts '\hline'
    output2.puts '\end{tabular}'
    output2.puts '\end{center}'
    output2.puts '\end{table}'+"\n\n"
  end

  # Writes the "nucleotides removed in every plugin" table followed by the
  # warning boxes of every plugin that crossed its threshold.
  #
  # stats           - per-plugin statistics; stats[plugin][field] must be a
  #                   collection of [size, count] pairs.
  # output2         - IO-like object that responds to puts.
  # plugin_nts_hash - row title => settings hash with the keys 'plugin',
  #                   'field', 'msg', 'threshold' and 'warning'. The text
  #                   'my_percent' inside 'msg' is replaced by the computed
  #                   percentage; the settings themselves are not modified.
  # nts_total       - number of input nucleotides (the 100% reference).
  # paired_nts      - nucleotides of paired inserts, added to the
  #                   PluginExtractInserts/insert_size row.
  #
  # PluginExtractInserts rows warn when the percentage is at or below the
  # threshold; every other plugin warns when it is at or above it.
  def print_trimmed_nts_stats_table(stats, output2, plugin_nts_hash, nts_total, paired_nts)
    nts_table_hash = {}
    insert_array = []
    warning_array = []

    plugin_nts_hash.each do |my_name, settings|
      plugin_name = settings['plugin']
      plugin_field = settings['field']
      plugin_threshold = settings['threshold']
      plugin_warning = settings['warning']

      field_stats = (stats[plugin_name] || {})[plugin_field]
      next if field_stats.nil?

      count = 0
      field_stats.each { |element| count += element[0].to_i * element[1].to_i }

      is_insert = (plugin_name == 'PluginExtractInserts')
      count += paired_nts if is_insert && (plugin_field == 'insert_size') && (paired_nts > 0)

      my_percent = sprintf("%0.3f", (count.to_f*100/nts_total.to_f))

      threshold_crossed = if is_insert
        my_percent.to_f <= plugin_threshold
      else
        my_percent.to_f >= plugin_threshold
      end

      if threshold_crossed
        # non-destructive gsub: the placeholder must survive for later calls
        plugin_msg = settings['msg'].gsub('my_percent', my_percent)
        box = '\noindent ' + warning_box(is_insert ? 'pink' : 'yellow', "#{plugin_warning} #{plugin_msg}")
        (is_insert ? insert_array : warning_array).push(box)
      else
        plugin_warning = 'OK'
      end

      nts_table_hash[plugin_field] = ["#{my_name}&#{count}&#{my_percent} \\%&#{plugin_warning}\\\\", my_percent]
    end

    output2.puts '\begin{table}[H]'
    output2.puts '\caption{Summary of nucleotides removed in every plugin.}'
    output2.puts '\begin{center}'
    output2.puts '\begin{tabular}{l r r c}'
    output2.puts '\hline'
    output2.puts 'Plugin & Nucleotides & Percent & Warnings \\\\ [0.5ex]'
    output2.puts '\hline'

    # rows ordered by percentage, highest first; compared as floats because
    # most percentages are below 1 and would all truncate to 0
    nts_table_ordered = nts_table_hash.sort { |a, b| b[1][1].to_f <=> a[1][1].to_f }

    nts_table_ordered.each do |field, row|
      output2.puts row[0] unless field == 'insert_size'
    end

    # the insert row always goes last, in its own section, when it exists
    output2.puts '\hline'
    insert_row = nts_table_hash['insert_size']
    output2.puts insert_row[0] unless insert_row.nil?
    output2.puts '\hline'
    output2.puts '\end{tabular}'
    output2.puts '\label{table:nonlin}'
    output2.puts '\end{center}'
    output2.puts '\end{table}'+"\n\n"

    output2.puts '\noindent \begin{minipage}{\textwidth}'

    output2.puts insert_array.join("\n")
    output2.puts warning_array.join("\n")

    output2.puts '\end{minipage}'+"\n\n"
  end

  private

  # Insert size => count hash of PluginExtractInserts, or {} when missing.
  def insert_size_histogram(stats)
    (stats['PluginExtractInserts'] || {})['insert_size'] || {}
  end

  # Nucleotides contained in the left and right inserts of paired reads.
  # Either side (or the whole plugin entry) may be missing.
  def paired_insert_nts(stats)
    inserts = stats['PluginExtractInserts'] || {}
    total = 0
    ['left_insert_size', 'right_insert_size'].each do |field|
      (inserts[field] || {}).each do |element|
        total += element[0].to_i * element[1].to_i
      end
    end
    total
  end

  # Coloured, framed warning paragraph used for every warning in the report.
  def warning_box(color, text)
    '\fcolorbox{black}{' + color + "}{\n" +
      '\begin{minipage}{\linewidth}{' + "\n" +
      '\textbf{' + text + "}\n}\n" +
      '\end{minipage}' + "\n" +
      '}\\\\\\\\'
  end

  # References to the graph files; they are included unconditionally.
  def write_graph_inputs(output2)
    output2.puts '\input{input_graph}'
    output2.puts '\input{qv_graph}'
    output2.puts '\input{output_graph}'+"\n\n"
  end

  # Main summary table: input reads, output reads and, for paired data,
  # the paired/linker rows.
  def write_summary_table(output2, initial_stats, stats)
    counts = stats['sequences']['count']
    input_seqs = counts['input_count'].to_i
    rejected_seqs = counts['rejected'].to_i
    output_seqs = counts['output_seqs'].to_i
    # only present for paired data; nil.to_i is 0
    output_seqs_paired = counts['output_seqs_paired'].to_i
    # only present when low complexity was checked
    low_complex = counts['output_seqs_low_complexity'].to_i

    input_mode, output_mode = get_mode(initial_stats, stats)
    input_mean, output_mean = get_mean(initial_stats, stats)

    output2.puts '\begin{table}[H]'
    output2.puts '\begin{center}'
    output2.puts '\begin{tabular}{l r r}'
    output2.puts " \\hline"
    output2.puts "Input reads: & total & #{input_seqs} \\\\"
    output2.puts " & Smallest read (bp) & #{initial_stats['smallest_sequence_size'].to_i} \\\\"
    output2.puts " & Largest read (bp)& #{initial_stats['biggest_sequence_size'].to_i} \\\\"
    output2.puts " & Mode (bp) & #{input_mode} \\\\"
    output2.puts " & Mean (bp)& #{input_mean} \\\\"

    output2.puts " \\\\ \\hline"
    output2.puts "Output results: & total & #{output_seqs} \\\\"
    output2.puts " & Rejected & #{rejected_seqs} \\\\"
    output2.puts " & Low complexity reads & #{low_complex} \\\\" if low_complex != 0
    output2.puts " & Mode (bp)& #{output_mode} \\\\"
    output2.puts " & Mean (bp)& #{output_mean} \\\\"

    output2.puts "\\\\"
    if output_seqs_paired != 0
      output2.puts " & Output paired reads & #{output_seqs_paired} \\\\"
      output2.puts " & Total output reads & #{output_seqs_paired + output_seqs} \\\\"
      output2.puts "\\\\ \\hline"
      output2.puts "Linkers: & & \\\\"
      write_linker_rows(output2, stats['PluginLinker'])
    end

    output2.puts "\\hline"

    output2.puts '\end{tabular}'
    output2.puts '\label{table:nonlin}'
    output2.puts '\end{center}'
    output2.puts '\end{table}'+"\n\n"
  end

  # Linker rows of the summary table; writes nothing without linker stats.
  def write_linker_rows(output2, linker_stats)
    return if linker_stats.nil?

    (linker_stats['linker_id'] || []).each do |linker|
      output2.puts " & #{linker[0]} & #{linker[1]} \\\\"
    end

    output2.puts "\\\\ \\hline"
    unless linker_stats['without_linker'].nil?
      output2.puts "Without linkers: & total & #{linker_stats['without_linker']['0']} \\\\"
    end

    output2.puts "\\\\ \\hline"
    output2.puts "Multiple linkers: & & \\\\"

    (linker_stats['multiple_linker_id'] || []).each do |linker|
      output2.puts " & #{linker[0]} & #{linker[1]} \\\\"
    end
    (linker_stats['multiple_linker_count'] || []).each do |linker|
      output2.puts " & With #{linker[0]} linkers & #{linker[1]} \\\\"
    end
  end

  # Number of reads with a MID, plus a warning box when they are 1% of the
  # input reads or less. Writes nothing when there are no MID statistics.
  def write_mid_summary(output2, stats)
    mid_ids = (stats['PluginMids'] || {})['mid_id']
    return if mid_ids.nil?

    input_seqs = stats['sequences']['count']['input_count'].to_i
    mid_seqs = mid_ids['total']
    mid_seqs_percent = sprintf("%0.3f", (mid_seqs.to_f*100/input_seqs.to_f))

    output2.puts '\noindent \begin{minipage}{\linewidth}'
    output2.puts "number of reads with MID: #{mid_seqs} (#{mid_seqs_percent}\\%)"+'\\\\'+'\\\\'

    if mid_seqs_percent.to_f <= 1
      output2.puts warning_box('yellow', 'WARNING: The number of reads with MID is so low that can be interpreted as a random finding. Your useful sequences are in the no\_MID folder, but you can also add any read classified as having a MID')
    end
    output2.puts '\end{minipage}'+"\n\n"
  end

  # One "most frequent" table per plugin that reported identifiers.
  def write_top_five_tables(output2, stats)
    [
      ['PluginVectors', 'vectors_ids', 'Vectors'],
      ['PluginAbAdapters', 'adapter_id', 'Adapters'],
      ['PluginContaminants', 'contaminants_ids', 'Contaminants']
    ].each do |plugin, field, title|
      top_hash = (stats[plugin] || {})[field]
      make_a_top_five(output2, top_hash, title) unless top_hash.nil?
    end
  end

end
323
+
@@ -0,0 +1,65 @@
1
+ {
2
+ "Low Quality": {
3
+ "plugin": "PluginLowQuality",
4
+ "field": "low_qual",
5
+ "msg": "Warning!, there are too many (my_percent \\%) low quality nucleotides",
6
+ "threshold": 10,
7
+ "warning": "ntW1"
8
+ },
9
+ "Low Complexity": {
10
+ "plugin": "PluginLowComplexity",
11
+ "field": "low_complexity",
12
+ "msg": "Warning!, there are too many (my_percent \\%) low complexity nucleotides",
13
+ "threshold": 1.5,
14
+ "warning": "ntW4"
15
+ },
16
+ "Poly T": {
17
+ "plugin": "PluginFindPolyAt",
18
+ "field": "poly_t_size",
19
+ "msg": "Warning!, too many nucleotides (my_percent \\%) are poly T",
20
+ "threshold": 1.5,
21
+ "warning": "ntW5"
22
+ },
23
+ "Poly A": {
24
+ "plugin": "PluginFindPolyAt",
25
+ "field": "poly_a_size",
26
+ "msg": "Warning!, too many nucleotides (my_percent \\%) are poly A",
27
+ "threshold": 1.5,
28
+ "warning": "ntW6"
29
+ },
30
+ "Contaminants": {
31
+ "plugin": "PluginContaminants",
32
+ "field": "contaminants_size",
33
+ "msg": "Warning!, too many nucleotides (my_percent \\%) come from a contaminant sequence",
34
+ "threshold": 0.75,
35
+ "warning": "ntW7"
36
+ },
37
+ "Adapters": {
38
+ "plugin": "PluginAbAdapters",
39
+ "field": "adapter_size",
40
+ "msg": "Warning!, too many nucleotides (my_percent \\%) come from adapters",
41
+ "threshold": 1.5,
42
+ "warning": "ntW3"
43
+ },
44
+ "Vectors": {
45
+ "plugin": "PluginVectors",
46
+ "field": "vector_size",
47
+ "msg": "Warning!, too many nucleotides (my_percent \\%) come from vectors",
48
+ "threshold": 0.75,
49
+ "warning": "ntW2"
50
+ },
51
+ "Indeterminations": {
52
+ "plugin": "PluginIndeterminations",
53
+ "field": "indetermination_size",
54
+ "msg": "Warning!, too many nucleotides (my_percent \\%) are indeterminations (Ns)",
55
+ "threshold": 0.01,
56
+ "warning": "ntW8"
57
+ },
58
+ "Inserts": {
59
+ "plugin": "PluginExtractInserts",
60
+ "field": "insert_size",
61
+ "msg": "Warning!, only my_percent \\% of nucleotides are useful",
62
+ "threshold": 50,
63
+ "warning": "iW1"
64
+ }
65
+ }
@@ -0,0 +1,69 @@
1
+ {
2
+ "contaminated": {
3
+ "name": "Contaminants",
4
+ "msg": "Warning!, a my_percent \\% of your sequences are from a contaminant organism or from organelles",
5
+ "threshold": 0.75,
6
+ "warning": "rdW4"
7
+ },
8
+ "short insert": {
9
+ "name": "Short inserts",
10
+ "msg": "Warning!, a my_percent \\% of your sequences are too short",
11
+ "threshold": 7.5,
12
+ "warning": "rdW2"
13
+ },
14
+ "low complexity by polyt": {
15
+ "name": "Low Complexity",
16
+ "msg": "Warning!, a my_percent \\% of your sequences are low complexity sequences",
17
+ "threshold": 1,
18
+ "warning": "rdW6"
19
+ },
20
+ "empty insert": {
21
+ "name": "Empty Inserts",
22
+ "msg": "Warning!, a my_percent \\% of your sequences are empty (without an insert)",
23
+ "threshold": 0.5,
24
+ "warning": "rdW3"
25
+ },
26
+ "No valid inserts found": {
27
+ "name": "No Valid Inserts",
28
+ "msg": "Warning!, a my_percent \\% of your sequences are not valid sequences",
29
+ "threshold": 0.05,
30
+ "warning": "rdW5"
31
+ },
32
+
33
+ "At least one N found": {
34
+ "name": "At least one N found",
35
+ "msg": "",
36
+ "threshold": 1,
37
+ "warning": ""
38
+ },
39
+ "Primer pair not found": {
40
+ "name": "Primer pair not found",
41
+ "msg": "",
42
+ "threshold": 1,
43
+ "warning": ""
44
+ },
45
+ "repeated": {
46
+ "name": "Repeated Sequences",
47
+ "msg": "Warning!, a my_percent \\% of your sequences are repeated",
48
+ "threshold": 9,
49
+ "warning": "rdW1"
50
+ },
51
+ "Indeterminations in middle of sequence": {
52
+ "name": "Indeterminations",
53
+ "msg": "Warning!, a my_percent \\% of your sequences contain too many indeterminations",
54
+ "threshold": 0.05,
55
+ "warning": "rdW8"
56
+ },
57
+ "unexpected vector": {
58
+ "name": "Unexpected Vector",
59
+ "msg": "Warning!, a my_percent \\% of your sequences contain a vector in an unexpected position",
60
+ "threshold": 0.01,
61
+ "warning": "rdW7"
62
+ },
63
+ "rejected": {
64
+ "name": "Total Rejected",
65
+ "msg": "Warning!, a my_percent \\% of your sequences were rejected!",
66
+ "threshold": 30,
67
+ "warning": "rdWT"
68
+ }
69
+ }