seqtrimnext_report 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,323 @@
1
# Writes the statistics section (stats.tex) of the LaTeX report.
#
# Creating an instance does all the work: StatsReport.new(...) writes
# <output_latex>/stats.tex and prints a confirmation line to stdout.
#
# The `stats` and `initial_stats` arguments are nested hashes with string
# keys (the original comments refer to them as stats.json and
# initial_stats.json). Only the keys actually read below are required.
class StatsReport

  # Builds stats.tex inside +output_latex+.
  #
  # all_params      - accepted for interface compatibility; not read here.
  # initial_stats   - hash with the statistics of the input reads.
  # stats           - hash with the per-plugin statistics of the run.
  # plugin_nts_hash - per-plugin configuration used for the nucleotide table
  #                   (see print_trimmed_nts_stats_table).
  # output_folder   - accepted for interface compatibility; not read here.
  # output_latex    - directory where stats.tex is created.
  #
  # The file is opened in block form so that it is always closed, even when
  # one of the table builders raises on unexpected input.
  def initialize(all_params, initial_stats, stats, plugin_nts_hash, output_folder, output_latex)
    File.open(File.join(output_latex, 'stats.tex'), 'w') do |output2|
      output2.puts "%!TEX root = FinalReport.tex"

      # Graph includes are emitted unconditionally (the existence checks on
      # the png files were already disabled in the original code).
      output2.puts '\input{input_graph}'
      output2.puts '\input{qv_graph}'
      output2.puts '\input{output_graph}' + "\n\n"

      write_summary_table(output2, initial_stats, stats)
      write_mid_section(output2, stats)
      write_top_five_tables(output2, stats)

      nts_total = initial_stats['nucleotide_count']
      print_trimmed_nts_stats_table(stats, output2, plugin_nts_hash, nts_total, paired_insert_nts(stats))
    end

    puts "Statistic information was added to the report"
  end

  # Returns [input_mode, output_mode].
  #
  # input_mode is read as-is from initial_stats['mode_of_sizes'].
  # output_mode is obtained from ScbiStats#fat_mode over an array indexed by
  # insert size whose values are the read counts (missing sizes become 0).
  # ScbiStats is defined outside this file; its exact semantics are not
  # visible here. stats['PluginExtractInserts']['insert_size'] must exist.
  def get_mode(initial_stats, stats)
    input_mode = initial_stats['mode_of_sizes']

    reads_by_size = []
    stats['PluginExtractInserts']['insert_size'].each do |size, reads|
      reads_by_size[size.to_i] = reads
    end
    # Assigning by index leaves nil gaps; replace them with zero counts.
    reads_by_size.map! { |reads| reads || 0 }

    output_mode = ScbiStats.new(reads_by_size).fat_mode

    return [input_mode, output_mode]
  end

  # Returns [input_mean, output_mean], both formatted with one decimal.
  #
  # input_mean comes from initial_stats['mean_of_sequence_sizes'].
  # output_mean is the weighted mean of the insert sizes found in
  # stats['PluginExtractInserts']['insert_size'] (size => number of reads).
  #
  # Counts are converted with to_i for both the numerator and the
  # denominator, so string counts no longer raise, and an empty histogram
  # yields "0.0" instead of a ZeroDivisionError.
  def get_mean(initial_stats, stats)
    input_mean = sprintf("%0.1f", (initial_stats['mean_of_sequence_sizes']))

    nts_count = 0.0
    seqs_count = 0
    stats['PluginExtractInserts']['insert_size'].each do |size, reads|
      seqs_count += reads.to_i
      nts_count += size.to_f * reads.to_i
    end

    mean = seqs_count.zero? ? 0.0 : nts_count / seqs_count
    output_mean = sprintf("%0.1f", mean)

    return [input_mean, output_mean]
  end

  # Writes a LaTeX table with the (at most) five entries of +top_hash+ that
  # have the highest value.
  #
  # output2  - IO-like object receiving the LaTeX (must respond to puts).
  # top_hash - hash of identifier => number of sequences.
  # name     - label used in the caption and in the first column header.
  #
  # Underscores in identifiers are escaped for LaTeX; no other character is
  # escaped.
  def make_a_top_five(output2, top_hash, name)
    output2.puts '\begin{table}[H]'
    output2.puts '\caption{' + "List of the most frequent~#{name}~found among your reads" + '}'
    output2.puts '\vspace{-0.5cm}'
    output2.puts '\begin{center}'
    output2.puts '\begin{tabular}{|p{11cm}|r|}'
    output2.puts '\hline'
    output2.puts "#{name} " + '& sequences \\\\ [0.5ex]'
    output2.puts '\hline'

    top_hash.sort { |a, b| b[1] <=> a[1] }.first(5).each do |id, seqs|
      output2.puts "#{id.gsub('_', '\_')} & #{seqs}" + '\\\\'
    end

    output2.puts '\hline'
    output2.puts '\end{tabular}'
    output2.puts '\end{center}'
    output2.puts '\end{table}' + "\n\n"
  end

  # Writes the "nucleotides removed in every plugin" table followed by the
  # warning boxes raised by the configured thresholds.
  #
  # stats           - per-plugin statistics; each referenced field is a
  #                   collection of (size, number of reads) pairs.
  # output2         - IO-like object receiving the LaTeX.
  # plugin_nts_hash - display name => { 'plugin', 'field', 'msg',
  #                   'threshold', 'warning' }.
  # nts_total       - total nucleotides in the input, used for percentages.
  # paired_nts      - nucleotides of the left/right paired inserts, added to
  #                   the PluginExtractInserts/insert_size row when positive.
  #
  # For PluginExtractInserts a warning is raised when the percentage is at or
  # BELOW the threshold; for every other plugin when it is at or ABOVE it.
  #
  # The configured message is substituted on a copy, so the caller's
  # configuration keeps its 'my_percent' placeholder. Rows are ordered by
  # their numeric percentage (descending, configuration order on ties), and
  # the insert row is skipped when it is not available.
  def print_trimmed_nts_stats_table(stats, output2, plugin_nts_hash, nts_total, paired_nts)
    nts_table_hash = {}
    insert_array = []
    warning_array = []

    plugin_nts_hash.each do |my_name, settings|
      plugin_name = settings['plugin']
      plugin_field = settings['field']
      plugin_threshold = settings['threshold']
      plugin_warning = settings['warning']

      next if stats[plugin_name].nil? || stats[plugin_name][plugin_field].nil?

      count = 0
      stats[plugin_name][plugin_field].each do |size, reads|
        count += size.to_i * reads.to_i
      end

      is_insert = (plugin_name == 'PluginExtractInserts')
      if is_insert && (plugin_field == 'insert_size') && (paired_nts > 0)
        count += paired_nts
      end

      my_percent = sprintf("%0.3f", (count.to_f * 100 / nts_total.to_f))

      if is_insert
        triggered = (my_percent.to_f <= plugin_threshold)
      else
        triggered = (my_percent.to_f >= plugin_threshold)
      end

      if triggered
        # Non-destructive gsub: the configuration string is left untouched.
        plugin_msg = settings['msg'].gsub('my_percent', my_percent)
        box = '\noindent ' + warning_box(is_insert ? 'pink' : 'yellow', "#{plugin_warning} #{plugin_msg}")
        (is_insert ? insert_array : warning_array).push(box)
      else
        plugin_warning = 'OK'
      end

      nts_table_hash[plugin_field] = ["#{my_name}&#{count}&#{my_percent} \\%&#{plugin_warning}\\\\", my_percent]
    end

    output2.puts '\begin{table}[H]'
    output2.puts '\caption{Summary of nucleotides removed in every plugin.}'
    output2.puts '\begin{center}'
    output2.puts '\begin{tabular}{l r r c}'
    output2.puts '\hline'
    output2.puts 'Plugin & Nucleotides & Percent & Warnings \\\\ [0.5ex]'
    output2.puts '\hline'

    # Every row except the inserts one, highest percentage first. The index
    # is part of the sort key so that equal percentages keep their order.
    removed_rows = nts_table_hash.reject { |field, _row| field == 'insert_size' }.to_a
    ordered_rows = removed_rows.each_with_index.sort_by do |pair, position|
      [-pair[1][1].to_f, position]
    end
    ordered_rows.each do |pair, _position|
      output2.puts pair[1][0]
    end

    # The inserts row closes the table, separated by a rule.
    insert_row = nts_table_hash['insert_size']
    unless insert_row.nil?
      output2.puts '\hline'
      output2.puts insert_row[0]
    end
    output2.puts '\hline'
    output2.puts '\end{tabular}'
    output2.puts '\label{table:nonlin}'
    output2.puts '\end{center}'
    output2.puts '\end{table}' + "\n\n"

    output2.puts '\noindent \begin{minipage}{\textwidth}'
    output2.puts insert_array.join("\n")
    output2.puts warning_array.join("\n")
    output2.puts '\end{minipage}' + "\n\n"
  end

  private

  # Writes the first table of the report: input read figures, output read
  # figures and, for paired runs, the paired/linker figures.
  def write_summary_table(output2, initial_stats, stats)
    counts = stats['sequences']['count']
    input_seqs = counts['input_count'].to_i
    rejected_seqs = counts['rejected'].to_i
    output_seqs = counts['output_seqs'].to_i

    # Only present when there are paired reads (nil.to_i is 0).
    output_seqs_paired = counts['output_seqs_paired'].to_i
    total_output_seqs = output_seqs_paired + output_seqs

    # Only present when low complexity reads were found (not for genomic runs).
    low_complex = counts['output_seqs_low_complexity'].to_i

    (input_mode, output_mode) = get_mode(initial_stats, stats)
    (input_mean, output_mean) = get_mean(initial_stats, stats)

    output2.puts '\begin{table}[H]'
    output2.puts '\begin{center}'
    output2.puts '\begin{tabular}{l r r}'
    output2.puts " \\hline"
    output2.puts "Input reads: & total & #{input_seqs} \\\\"
    output2.puts " & Smallest read (bp) & #{initial_stats['smallest_sequence_size'].to_i} \\\\"
    output2.puts " & Largest read (bp)& #{initial_stats['biggest_sequence_size'].to_i} \\\\"
    output2.puts " & Mode (bp) & #{input_mode} \\\\"
    output2.puts " & Mean (bp)& #{input_mean} \\\\"

    output2.puts " \\\\ \\hline"
    output2.puts "Output results: & total & #{output_seqs} \\\\"
    output2.puts " & Rejected & #{rejected_seqs} \\\\"
    if low_complex != 0
      output2.puts " & Low complexity reads & #{low_complex} \\\\"
    end
    output2.puts " & Mode (bp)& #{output_mode} \\\\"
    output2.puts " & Mean (bp)& #{output_mean} \\\\"

    output2.puts "\\\\"
    # Paired figures and linker details are only reported for paired runs.
    if output_seqs_paired != 0
      output2.puts " & Output paired reads & #{output_seqs_paired} \\\\"
      output2.puts " & Total output reads & #{total_output_seqs} \\\\"
      output2.puts "\\\\ \\hline"
      output2.puts "Linkers: & & \\\\"
      write_linker_rows(output2, stats['PluginLinker'])
    end

    output2.puts "\\hline"

    output2.puts '\end{tabular}'
    output2.puts '\label{table:nonlin}'
    output2.puts '\end{center}'
    output2.puts '\end{table}' + "\n\n"
  end

  # Writes the linker rows of the summary table. Nothing is written when the
  # run has no PluginLinker statistics.
  def write_linker_rows(output2, linker_stats)
    return if linker_stats.nil?

    unless linker_stats['linker_id'].nil?
      linker_stats['linker_id'].each do |linker|
        output2.puts " & #{linker[0]} & #{linker[1]} \\\\"
      end
    end
    output2.puts "\\\\ \\hline"
    unless linker_stats['without_linker'].nil?
      output2.puts "Without linkers: & total & #{linker_stats['without_linker']['0']} \\\\"
    end

    output2.puts "\\\\ \\hline"
    output2.puts "Multiple linkers: & & \\\\"

    unless linker_stats['multiple_linker_id'].nil?
      linker_stats['multiple_linker_id'].each do |linker|
        output2.puts " & #{linker[0]} & #{linker[1]} \\\\"
      end
    end
    unless linker_stats['multiple_linker_count'].nil?
      linker_stats['multiple_linker_count'].each do |linker|
        output2.puts " & With #{linker[0]} linkers & #{linker[1]} \\\\"
      end
    end
  end

  # Writes the number of reads with a MID and, when they are 1% or less of
  # the input reads, a highlighted warning. Skipped when there are no
  # PluginMids/mid_id statistics.
  def write_mid_section(output2, stats)
    return if stats['PluginMids'].nil? || stats['PluginMids']['mid_id'].nil?

    input_seqs = stats['sequences']['count']['input_count'].to_i
    mid_seqs = stats['PluginMids']['mid_id']['total']
    mid_seqs_percent = sprintf("%0.3f", (mid_seqs.to_f * 100 / input_seqs.to_f))

    output2.puts '\noindent \begin{minipage}{\linewidth}'
    output2.puts "number of reads with MID: #{mid_seqs} (#{mid_seqs_percent}\\%)" + '\\\\' + '\\\\'

    if mid_seqs_percent.to_f <= 1
      output2.puts warning_box('yellow', 'WARNING: The number of reads with MID is so low that can be interpreted as a random finding. Your useful sequences are in the no\_MID folder, but you can also add any read classified as having a MID')
    end
    output2.puts '\end{minipage}' + "\n\n"
  end

  # Writes one "top five" table for each of vectors, adapters and
  # contaminants whose statistics are present.
  def write_top_five_tables(output2, stats)
    [
      ['PluginVectors', 'vectors_ids', 'Vectors'],
      ['PluginAbAdapters', 'adapter_id', 'Adapters'],
      ['PluginContaminants', 'contaminants_ids', 'Contaminants']
    ].each do |plugin, field, title|
      next if stats[plugin].nil?

      top_hash = stats[plugin][field]
      make_a_top_five(output2, top_hash, title) unless top_hash.nil?
    end
  end

  # Returns the nucleotides contained in the left and right inserts of paired
  # reads (sum of size * number of reads). Each side is optional, so a run
  # that only reports one of them no longer fails; 0 when neither exists.
  def paired_insert_nts(stats)
    inserts = stats['PluginExtractInserts'] || {}
    total = 0
    ['left_insert_size', 'right_insert_size'].each do |side|
      (inserts[side] || []).each do |size, reads|
        total += size.to_i * reads.to_i
      end
    end
    total
  end

  # Returns the LaTeX for a framed, coloured box containing +text+ in bold.
  def warning_box(color, text)
    '\fcolorbox{black}{' + color + '}{' + "\n" +
      '\begin{minipage}{\linewidth}{' + "\n" +
      '\textbf{' + text + '}' + "\n" +
      '}' + "\n" +
      '\end{minipage}' + "\n" +
      '}\\\\\\\\'
  end

end
323
+
@@ -0,0 +1,65 @@
1
+ {
2
+ "Low Quality": {
3
+ "plugin": "PluginLowQuality",
4
+ "field": "low_qual",
5
+ "msg": "Warning!, there are too many (my_percent \\%) low quality nucleotides",
6
+ "threshold": 10,
7
+ "warning": "ntW1"
8
+ },
9
+ "Low Complexity": {
10
+ "plugin": "PluginLowComplexity",
11
+ "field": "low_complexity",
12
+ "msg": "Warning!, there are too many (my_percent \\%) low complexity nucleotides",
13
+ "threshold": 1.5,
14
+ "warning": "ntW4"
15
+ },
16
+ "Poly T": {
17
+ "plugin": "PluginFindPolyAt",
18
+ "field": "poly_t_size",
19
+ "msg": "Warning!, too many nucleotides (my_percent \\%) are poly T",
20
+ "threshold": 1.5,
21
+ "warning": "ntW5"
22
+ },
23
+ "Poly A": {
24
+ "plugin": "PluginFindPolyAt",
25
+ "field": "poly_a_size",
26
+ "msg": "Warning!, too many nucleotides (my_percent \\%) are poly A",
27
+ "threshold": 1.5,
28
+ "warning": "ntW6"
29
+ },
30
+ "Contaminants": {
31
+ "plugin": "PluginContaminants",
32
+ "field": "contaminants_size",
33
+ "msg": "Warning!, too many nucleotides (my_percent \\%) come from a contaminant sequence",
34
+ "threshold": 0.75,
35
+ "warning": "ntW7"
36
+ },
37
+ "Adapters": {
38
+ "plugin": "PluginAbAdapters",
39
+ "field": "adapter_size",
40
+ "msg": "Warning!, too many nucleotides (my_percent \\%) come from adapters",
41
+ "threshold": 1.5,
42
+ "warning": "ntW3"
43
+ },
44
+ "Vectors": {
45
+ "plugin": "PluginVectors",
46
+ "field": "vector_size",
47
+ "msg": "Warning!, too many nucleotides (my_percent \\%) come from vectors",
48
+ "threshold": 0.75,
49
+ "warning": "ntW2"
50
+ },
51
+ "Indeterminations": {
52
+ "plugin": "PluginIndeterminations",
53
+ "field": "indetermination_size",
54
+ "msg": "Warning!, too many nucleotides (my_percent \\%) are indeterminations (Ns)",
55
+ "threshold": 0.01,
56
+ "warning": "ntW8"
57
+ },
58
+ "Inserts": {
59
+ "plugin": "PluginExtractInserts",
60
+ "field": "insert_size",
61
+ "msg": "Warning!, only my_percent \\% of nucleotides are useful",
62
+ "threshold": 50,
63
+ "warning": "iW1"
64
+ }
65
+ }
@@ -0,0 +1,69 @@
1
+ {
2
+ "contaminated": {
3
+ "name": "Contaminants",
4
+ "msg": "Warning!, a my_percent \\% of your sequences are from a contaminant organism or from organelles",
5
+ "threshold": 0.75,
6
+ "warning": "rdW4"
7
+ },
8
+ "short insert": {
9
+ "name": "Short inserts",
10
+ "msg": "Warning!, a my_percent \\% of your sequences are too short",
11
+ "threshold": 7.5,
12
+ "warning": "rdW2"
13
+ },
14
+ "low complexity by polyt": {
15
+ "name": "Low Complexity",
16
+ "msg": "Warning!, a my_percent \\% of your sequences are low complexity sequences",
17
+ "threshold": 1,
18
+ "warning": "rdW6"
19
+ },
20
+ "empty insert": {
21
+ "name": "Empty Inserts",
22
+ "msg": "Warning!, a my_percent \\% of your sequences are empty (without an insert)",
23
+ "threshold": 0.5,
24
+ "warning": "rdW3"
25
+ },
26
+ "No valid inserts found": {
27
+ "name": "No Valid Inserts",
28
+ "msg": "Warning!, a my_percent \\% of your sequences are not valid sequences",
29
+ "threshold": 0.05,
30
+ "warning": "rdW5"
31
+ },
32
+
33
+ "At least one N found": {
34
+ "name": "At least one N found",
35
+ "msg": "",
36
+ "threshold": 1,
37
+ "warning": ""
38
+ },
39
+ "Primer pair not found": {
40
+ "name": "Primer pair not found",
41
+ "msg": "",
42
+ "threshold": 1,
43
+ "warning": ""
44
+ },
45
+ "repeated": {
46
+ "name": "Repeated Sequences",
47
+ "msg": "Warning!, there are a my_percent \\% of repeated sequences",
48
+ "threshold": 9,
49
+ "warning": "rdW1"
50
+ },
51
+ "Indeterminations in middle of sequence": {
52
+ "name": "Indeterminations",
53
+ "msg": "Warning!, a my_percent \\% of your sequences contain too many indeterminations",
54
+ "threshold": 0.05,
55
+ "warning": "rdW8"
56
+ },
57
+ "unexpected vector": {
58
+ "name": "Unexpected Vector",
59
+ "msg": "Warning!, a my_percent \\% of your sequences contain a vector in an unexpected position",
60
+ "threshold": 0.01,
61
+ "warning": "rdW7"
62
+ },
63
+ "rejected": {
64
+ "name": "Total Rejected",
65
+ "msg": "Warning!, a my_percent \\% of your sequences were rejected!",
66
+ "threshold": 30,
67
+ "warning": "rdWT"
68
+ }
69
+ }