RubyGems - seqtrimnext_report - Versions diffs - 0.0.2 - Mend

seqtrimnext_report 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

data/History.txt +7 -0
data/Manifest.txt +24 -0
data/PostInstall.txt +7 -0
data/README.rdoc +49 -0
data/Rakefile +26 -0
data/bin/generate_report.rb +118 -0
data/lib/seqtrimnext_report.rb +12 -0
data/lib/seqtrimnext_report/classes/params_report.rb +84 -0
data/lib/seqtrimnext_report/classes/rejected_report.rb +207 -0
data/lib/seqtrimnext_report/classes/stats_report.rb +323 -0
data/lib/seqtrimnext_report/config/plugin_nts.json +65 -0
data/lib/seqtrimnext_report/config/plugin_seqs.json +69 -0
data/lib/seqtrimnext_report/latex_src/input_graph.tex +21 -0
data/lib/seqtrimnext_report/latex_src/main.tex +111 -0
data/lib/seqtrimnext_report/latex_src/output_files.tex +29 -0
data/lib/seqtrimnext_report/latex_src/output_graph.tex +22 -0
data/lib/seqtrimnext_report/latex_src/piescbi.jpg +0 -0
data/lib/seqtrimnext_report/latex_src/qv_graph.tex +21 -0
data/lib/seqtrimnext_report/latex_src/ref_seqs.png +0 -0
data/script/console +10 -0
data/script/destroy +14 -0
data/script/generate +14 -0
data/test/test_helper.rb +3 -0
data/test/test_seqtrimnext_report.rb +11 -0
metadata +103 -0

data/lib/seqtrimnext_report/classes/stats_report.rb ADDED

@@ -0,0 +1,323 @@
# Writes the statistics section (stats.tex) of the LaTeX report from the
# already-parsed +stats+ and +initial_stats+ hashes.
#
# NOTE(review): ScbiStats is used by #get_mode but is not required in this
# file; it is assumed to be loaded by the caller -- confirm.
class StatsReport
  # [key in stats, field holding the id => count hash, title used in the table]
  TOP_FIVE_SOURCES = [
    ['PluginVectors', 'vectors_ids', 'Vectors'],
    ['PluginAbAdapters', 'adapter_id', 'Adapters'],
    ['PluginContaminants', 'contaminants_ids', 'Contaminants']
  ].freeze

  # Text of the box shown when 1% or fewer of the input reads carry a MID.
  MID_WARNING = 'WARNING: The number of reads with MID is so low that can be interpreted as a random finding. Your useful sequences are in the no\_MID folder, but you can also add any read classified as having a MID'.freeze

  # Builds <output_latex>/stats.tex and prints a confirmation line to stdout.
  #
  # all_params      - not used by this class (kept for interface compatibility).
  # initial_stats   - hash read for smallest/biggest size, mode, mean and
  #                   nucleotide_count of the input reads.
  # stats           - hash keyed by plugin name (plus 'sequences') with counts.
  # plugin_nts_hash - per-plugin settings ('plugin', 'field', 'msg',
  #                   'threshold', 'warning') for the nucleotide table.
  # output_folder   - not used by this class (kept for interface compatibility).
  # output_latex    - directory where stats.tex is written.
  def initialize(all_params,initial_stats,stats,plugin_nts_hash,output_folder,output_latex)
    # Block form closes the file even when one of the sections raises.
    File.open(File.join(output_latex,'stats.tex'), 'w') do |output2|
      output2.puts "%!TEX root = FinalReport.tex"
      # The graph sections are always included; the File.exist? checks on the
      # PNG files that used to guard them were already disabled.
      output2.puts '\input{input_graph}'
      output2.puts '\input{qv_graph}'
      output2.puts '\input{output_graph}'+"\n\n"

      write_summary_table(output2, initial_stats, stats)
      write_mid_section(output2, stats)
      write_top_five_tables(output2, stats)

      nts_total = initial_stats['nucleotide_count']
      print_trimmed_nts_stats_table(stats, output2, plugin_nts_hash, nts_total, paired_insert_nts(stats))
    end
    puts "Statistic information was added to the report"
  end

  # Returns [input_mode, output_mode].
  #
  # input_mode is taken verbatim from initial_stats['mode_of_sizes'];
  # output_mode is ScbiStats#fat_mode over an array indexed by insert size
  # (missing sizes filled with 0). When there is no insert-size data the
  # output mode is 0 instead of raising.
  def get_mode(initial_stats,stats)
    input_mode = initial_stats['mode_of_sizes']
    insert_sizes = insert_size_histogram(stats)
    return [input_mode, 0] if insert_sizes.empty?

    mode_array = []
    insert_sizes.each do |key,value|
      mode_array[key.to_i]=value
    end
    mode_array.map!{|e| e || 0}
    [input_mode, ScbiStats.new(mode_array).fat_mode]
  end

  # Returns [input_mean, output_mean], both formatted with one decimal.
  #
  # input_mean comes from initial_stats['mean_of_sequence_sizes']; output_mean
  # is the read-weighted mean of the insert-size histogram, or "0.0" when the
  # histogram is missing or holds no reads (avoids a division by zero).
  def get_mean(initial_stats,stats)
    input_mean = sprintf("%0.1f", (initial_stats['mean_of_sequence_sizes']))

    nts_count = 0.0
    seqs_count = 0
    insert_size_histogram(stats).each do |key,value|
      seqs_count += value.to_i
      nts_count += key.to_f*value.to_f
    end
    mean = seqs_count.zero? ? 0.0 : nts_count/seqs_count
    [input_mean, sprintf("%0.1f", mean)]
  end

  # Writes a LaTeX table with the (at most) five entries of +top_hash+ that
  # have the highest counts. Underscores in the ids are escaped for LaTeX.
  #
  # output2  - IO-like object responding to #puts.
  # top_hash - id => count.
  # name     - title used in the caption and the header row.
  def make_a_top_five(output2,top_hash,name)
    output2.puts '\begin{table}[H]'
    output2.puts '\caption{'+"List of the most frequent~#{name}~found among your reads"+'}'
    output2.puts '\vspace{-0.5cm}'
    output2.puts '\begin{center}'
    output2.puts '\begin{tabular}{|p{11cm}|r|}'
    output2.puts '\hline'
    output2.puts "#{name} " +'& sequences \\\\ [0.5ex]'
    output2.puts '\hline'
    top_hash.sort{|a,b| b[1]<=>a[1]}.first(5).each do |elem|
      tmp_name = elem[0].gsub('_','\_')
      output2.puts "#{tmp_name} & #{elem[1]}"+'\\\\'
    end
    output2.puts '\hline'
    output2.puts '\end{tabular}'
    output2.puts '\end{center}'
    output2.puts '\end{table}'+"\n\n"
  end

  # Writes the "nucleotides removed in every plugin" table followed by the
  # warning boxes.
  #
  # For every entry of +plugin_nts_hash+ whose plugin/field exists in +stats+,
  # the nucleotide count is sum(size * reads) over that histogram
  # (+paired_nts+ is added to the PluginExtractInserts 'insert_size' row).
  # A PluginExtractInserts row warns when its percent is <= threshold; any
  # other row warns when its percent is >= threshold; otherwise the row is
  # marked 'OK'.
  #
  # Rows are listed by decreasing percent (config order breaks ties); the
  # 'insert_size' row, when present, is printed last in its own section.
  # The 'msg' templates in +plugin_nts_hash+ are not modified.
  #
  # NOTE(review): this table emits the same \label{table:nonlin} as the
  # summary table; left unchanged because other .tex sources may refer to it.
  def print_trimmed_nts_stats_table(stats, output2, plugin_nts_hash, nts_total,paired_nts)
    nts_table_hash = {}
    insert_array = []
    warning_array = []

    plugin_nts_hash.each do |my_name, settings|
      plugin_name = settings['plugin']
      plugin_field = settings['field']
      plugin_threshold = settings['threshold']
      plugin_warning = settings['warning']

      histogram = stats[plugin_name].nil? ? nil : stats[plugin_name][plugin_field]
      next if histogram.nil?

      from_inserts = (plugin_name == 'PluginExtractInserts')
      count = weighted_size_sum(histogram)
      if from_inserts && (plugin_field == 'insert_size') && (paired_nts > 0)
        count += paired_nts
      end
      my_percent = sprintf("%0.3f", (count.to_f*100/nts_total.to_f))

      # Useful nucleotides warn when too FEW remain; trimmed ones when too MANY.
      triggered = from_inserts ? (my_percent.to_f <= plugin_threshold) : (my_percent.to_f >= plugin_threshold)
      if triggered
        # Non-destructive gsub: the shared template keeps its placeholder.
        plugin_msg = settings['msg'].gsub('my_percent',"#{my_percent}")
        box = warning_box(from_inserts ? 'pink' : 'yellow', "#{plugin_warning}  #{plugin_msg}", '\noindent ')
        (from_inserts ? insert_array : warning_array).push box
      else
        plugin_warning = 'OK'
      end
      nts_table_hash[plugin_field] = ["#{my_name}&#{count}&#{my_percent} \\%&#{plugin_warning}\\\\",my_percent]
    end

    output2.puts '\begin{table}[H]'
    output2.puts '\caption{Summary of nucleotides removed in every plugin.}'
    output2.puts '\begin{center}'
    output2.puts '\begin{tabular}{l r r c}'
    output2.puts '\hline'
    output2.puts 'Plugin & Nucleotides & Percent & Warnings \\\\ [0.5ex]'
    output2.puts '\hline'

    trimmed_rows = []
    nts_table_hash.each do |field, row|
      trimmed_rows << row unless field == 'insert_size'
    end
    # Compare the full percentage (to_f); to_i would only look at its integer
    # part. The index keeps rows with equal percent in their original order.
    trimmed_rows.each_with_index.sort_by { |row, idx| [-row[1].to_f, idx] }.each do |row, _idx|
      output2.puts row[0]
    end
    output2.puts '\hline'

    insert_row = nts_table_hash['insert_size']
    unless insert_row.nil?
      output2.puts insert_row[0]
      output2.puts '\hline'
    end
    output2.puts '\end{tabular}'
    output2.puts '\label{table:nonlin}'
    output2.puts '\end{center}'
    output2.puts '\end{table}'+"\n\n"

    output2.puts '\noindent \begin{minipage}{\textwidth}'
    output2.puts insert_array.join("\n")
    output2.puts warning_array.join("\n")
    output2.puts '\end{minipage}'+"\n\n"
  end

  private

  # Writes the input/output read summary table (plus the paired-read and
  # linker rows when paired output reads were reported).
  def write_summary_table(output2, initial_stats, stats)
    counts = stats['sequences']['count']
    input_seqs = counts['input_count'].to_i
    rejected_seqs = counts['rejected'].to_i
    output_seqs = counts['output_seqs'].to_i
    # Both keys are optional; nil.to_i is 0, which disables their rows.
    # 'output_seqs_paired' is only present for paired reads;
    # 'output_seqs_low_complexity' is absent for genomic data.
    output_seqs_paired = counts['output_seqs_paired'].to_i
    low_complex = counts['output_seqs_low_complexity'].to_i

    (input_mode, output_mode) = get_mode(initial_stats,stats)
    (input_mean, output_mean) = get_mean(initial_stats,stats)

    output2.puts '\begin{table}[H]'
    output2.puts '\begin{center}'
    output2.puts '\begin{tabular}{l r r}'
    output2.puts " \\hline"
    output2.puts "Input reads: & total & #{input_seqs} \\\\"
    output2.puts " & Smallest read (bp) & #{initial_stats['smallest_sequence_size'].to_i} \\\\"
    output2.puts " & Largest read (bp)& #{initial_stats['biggest_sequence_size'].to_i} \\\\"
    output2.puts " & Mode (bp) & #{input_mode} \\\\"
    output2.puts " & Mean (bp)& #{input_mean} \\\\"
    output2.puts " \\\\ \\hline"
    output2.puts "Output results: & total & #{output_seqs} \\\\"
    output2.puts " & Rejected & #{rejected_seqs} \\\\"
    if low_complex != 0
      output2.puts " & Low complexity reads & #{low_complex} \\\\"
    end
    output2.puts " & Mode (bp)& #{output_mode} \\\\"
    output2.puts " & Mean (bp)& #{output_mean} \\\\"
    output2.puts "\\\\"
    if output_seqs_paired != 0
      write_paired_rows(output2, stats, output_seqs, output_seqs_paired)
    end
    output2.puts "\\hline"
    output2.puts '\end{tabular}'
    output2.puts '\label{table:nonlin}'
    output2.puts '\end{center}'
    output2.puts '\end{table}'+"\n\n"
  end

  # Rows that only appear when paired output reads exist: paired/total counts
  # and, when stats['PluginLinker'] is present, the linker breakdown.
  def write_paired_rows(output2, stats, output_seqs, output_seqs_paired)
    output2.puts " & Output paired reads & #{output_seqs_paired} \\\\"
    output2.puts " & Total output reads & #{output_seqs_paired+output_seqs} \\\\"
    output2.puts "\\\\ \\hline"
    output2.puts "Linkers: & & \\\\"

    linkers = stats['PluginLinker']
    return if linkers.nil?

    each_pair_of(linkers['linker_id']) do |id, reads|
      output2.puts " & #{id} & #{reads} \\\\"
    end
    output2.puts "\\\\ \\hline"
    unless linkers['without_linker'].nil?
      output2.puts "Without linkers: & total & #{linkers['without_linker']['0']} \\\\"
    end
    output2.puts "\\\\ \\hline"
    output2.puts "Multiple linkers: & & \\\\"
    each_pair_of(linkers['multiple_linker_id']) do |id, reads|
      output2.puts " & #{id} & #{reads} \\\\"
    end
    each_pair_of(linkers['multiple_linker_count']) do |amount, reads|
      output2.puts " & With #{amount} linkers & #{reads} \\\\"
    end
  end

  # Writes the MID paragraph when stats['PluginMids']['mid_id'] exists, with a
  # yellow warning box when 1% or fewer of the input reads carry a MID.
  def write_mid_section(output2, stats)
    mids = stats['PluginMids']
    return if mids.nil? || mids['mid_id'].nil?

    input_seqs = stats['sequences']['count']['input_count'].to_i
    mid_seqs = mids['mid_id']['total']
    mid_seqs_percent = sprintf("%0.3f", (mid_seqs.to_f*100/input_seqs.to_f))
    output2.puts '\noindent \begin{minipage}{\linewidth}'
    output2.puts "number of reads with MID: #{mid_seqs} (#{mid_seqs_percent}\\%)"+'\\\\'+'\\\\'
    if mid_seqs_percent.to_f <= 1
      output2.puts warning_box('yellow', MID_WARNING)
    end
    output2.puts '\end{minipage}'+"\n\n"
  end

  # Writes one top-five table per entry of TOP_FIVE_SOURCES found in +stats+.
  def write_top_five_tables(output2, stats)
    TOP_FIVE_SOURCES.each do |plugin, field, title|
      next if stats[plugin].nil?
      top_hash = stats[plugin][field]
      make_a_top_five(output2, top_hash, title) unless top_hash.nil?
    end
  end

  # Nucleotides held in the left and right inserts of paired reads.
  # Each side is optional, and so is the PluginExtractInserts entry itself.
  def paired_insert_nts(stats)
    inserts = stats['PluginExtractInserts']
    return 0 if inserts.nil?
    weighted_size_sum(inserts['left_insert_size']) + weighted_size_sum(inserts['right_insert_size'])
  end

  # stats['PluginExtractInserts']['insert_size'], or {} when it is missing.
  def insert_size_histogram(stats)
    inserts = stats['PluginExtractInserts']
    (inserts && inserts['insert_size']) || {}
  end

  # Sum of size * reads over a size => reads histogram (0 for nil).
  def weighted_size_sum(histogram)
    total = 0
    each_pair_of(histogram) { |size, reads| total += size.to_i*reads.to_i }
    total
  end

  # Yields the first and second item of every element of +pairs+; no-op for nil.
  def each_pair_of(pairs)
    return if pairs.nil?
    pairs.each { |pair| yield pair[0], pair[1] }
  end

  # LaTeX source of a framed, coloured box with +text+ in bold.
  # +prefix+ is placed verbatim before \fcolorbox (e.g. '\noindent ').
  def warning_box(colour, text, prefix = '')
    prefix+'\fcolorbox{black}{'+colour+'}{'+"\n"+'\begin{minipage}{\linewidth}{'+"\n"+'\textbf{'+text+'}'+"\n"+'}'+"\n"+'\end{minipage}'+"\n"+'}\\\\\\\\'
  end
end

data/lib/seqtrimnext_report/config/plugin_nts.json ADDED

@@ -0,0 +1,65 @@
+{
+  "Low Quality": {
+    "plugin": "PluginLowQuality",
+    "field": "low_qual",
+    "msg": "Warning!, there are too many (my_percent \\%) low quality nucleotides",
+    "threshold": 10,
+    "warning": "ntW1"
+  },
+  "Low Complexity": {
+    "plugin": "PluginLowComplexity",
+    "field": "low_complexity",
+    "msg": "Warning!, there are too many (my_percent \\%) low complexity nucleotides",
+    "threshold": 1.5,
+    "warning": "ntW4"
+  },
+  "Poly T": {
+    "plugin": "PluginFindPolyAt",
+    "field": "poly_t_size",
+    "msg": "Warning!, too many nucleotides (my_percent \\%) are poly T",
+    "threshold": 1.5,
+    "warning": "ntW5"
+  },
+  "Poly A": {
+    "plugin": "PluginFindPolyAt",
+    "field": "poly_a_size",
+    "msg": "Warning!, too many nucleotides (my_percent \\%) are poly A",
+    "threshold": 1.5,
+    "warning": "ntW6"
+  },
+  "Contaminants": {
+    "plugin": "PluginContaminants",
+    "field": "contaminants_size",
+    "msg": "Warning!, too many nucleotides (my_percent \\%) come from a contaminant sequence",
+    "threshold": 0.75,
+    "warning": "ntW7"
+  },
+  "Adapters": {
+    "plugin": "PluginAbAdapters",
+    "field": "adapter_size",
+    "msg": "Warning!, too many nucleotides (my_percent \\%) come from adapters",
+    "threshold": 1.5,
+    "warning": "ntW3"
+  },
+  "Vectors": {
+    "plugin": "PluginVectors",
+    "field": "vector_size",
+    "msg": "Warning!, too many nucleotides (my_percent \\%) come from vectors",
+    "threshold": 0.75,
+    "warning": "ntW2"
+  },
+  "Indeterminations": {
+    "plugin": "PluginIndeterminations",
+    "field": "indetermination_size",
+    "msg": "Warning!, too many nucleotides (my_percent \\%) are indeterminations (Ns)",
+    "threshold": 0.01,
+    "warning": "ntW8"
+  },
+  "Inserts": {
+    "plugin": "PluginExtractInserts",
+    "field": "insert_size",
+    "msg": "Warning!, only my_percent \\% of nucleotides are useful",
+    "threshold": 50,
+    "warning": "iW1"
+  }
+}

data/lib/seqtrimnext_report/config/plugin_seqs.json ADDED

@@ -0,0 +1,69 @@
+{
+  "contaminated": {
+    "name": "Contaminants",
+    "msg": "Warning!, a my_percent \\% of your sequences are from a contaminant organism or from organelles",
+    "threshold": 0.75,
+    "warning": "rdW4"
+  },
+  "short insert": {
+    "name": "Short inserts",
+    "msg": "Warning!, a my_percent \\% of your sequences are too short",
+    "threshold": 7.5,
+    "warning": "rdW2"
+  },
+  "low complexity by polyt": {
+    "name": "Low Complexity",
+    "msg": "Warning!, a my_percent \\% of your sequences are low complexity sequences",
+    "threshold": 1,
+    "warning": "rdW6"
+  },
+  "empty insert": {
+    "name": "Empty Inserts",
+    "msg": "Warning!, a my_percent \\% of your sequences are empty (without an insert)",
+    "threshold": 0.5,
+    "warning": "rdW3"
+  },
+  "No valid inserts found": {
+    "name": "No Valid Inserts",
+    "msg": "Warning!, a my_percent \\% of your sequences are not valid sequences",
+    "threshold": 0.05,
+    "warning": "rdW5"
+  },
+  "At least one N found": {
+    "name": "At least one N found",
+    "msg": "",
+    "threshold": 1,
+    "warning": ""
+  },
+    "Primer pair not found": {
+    "name": "Primer pair not found",
+    "msg": "",
+    "threshold": 1,
+    "warning": ""
+  },
+  "repeated": {
+    "name": "Repeated Sequences",
+    "msg": "Warning!, there are a my_percent \\% of repeated sequences",
+    "threshold": 9,
+    "warning": "rdW1"
+  },
+  "Indeterminations in middle of sequence": {
+    "name": "Indeterminations",
+    "msg": "Warning!, a my_percent \\% of your sequences contain too many indeterminations",
+    "threshold": 0.05,
+    "warning": "rdW8"
+  },
+  "unexpected vector": {
+    "name": "Unexpected Vector",
+    "msg": "Warning!, a my_percent \\% of your sequences contain a vector in an unexpected position",
+    "threshold": 0.01,
+    "warning": "rdW7"
+  },
+  "rejected": {
+    "name": "Total Rejected",
+    "msg": "Warning!, a my_percent \\% of your sequences were rejected!",
+    "threshold": 30,
+    "warning": "rdWT"
+  }
+}