RubyGems - DomFun - Versions diffs - 0.1.0 - Mend

DomFun 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

checksums.yaml +7 -0
data/.gitignore +11 -0
data/.rspec +3 -0
data/.travis.yml +7 -0
data/DomFun.gemspec +44 -0
data/Gemfile +4 -0
data/LICENSE.txt +21 -0
data/README.md +39 -0
data/Rakefile +6 -0
data/bin/add_protein_functional_families.rb +133 -0
data/bin/console +14 -0
data/bin/domains_to_function_predictor.rb +287 -0
data/bin/generate_CAFA2_dataset.rb +135 -0
data/bin/generate_CAFA2_tripartite_network.rb +139 -0
data/bin/generate_cafa_control.rb +45 -0
data/bin/get_kegg_pathways.R +12 -0
data/bin/lines.R +74 -0
data/bin/merge_pairs.rb +139 -0
data/bin/normalize_combined_scores.rb +118 -0
data/bin/prepare_cafa_network.rb +96 -0
data/bin/setup +8 -0
data/bin/standardize_scores.R +53 -0
data/bin/translate_kegg_genes2pathways.rb +98 -0
data/bin/validate_ProtFunSys_predictions.rb +174 -0
data/lib/DomFun.rb +6 -0
data/lib/DomFun/generalMethods.rb +105 -0
data/lib/DomFun/version.rb +3 -0
metadata +128 -0

data/bin/normalize_combined_scores.rb ADDED Viewed

@@ -0,0 +1,118 @@
+#! /usr/bin/env ruby
+##########################
+# Rojano E. & Seoane P., July 2019
+# Normalize combined scores for their use with CAFA validation
+##########################
+require 'optparse'
+##########################
+#METHODS
+##########################
+module Enumerable
+    def sum
+      self.inject(0){|accum, i| accum + i }
+    end
+    def mean
+      self.sum/self.length.to_f
+    end
+    def sample_variance
+      m = self.mean
+      sum = self.inject(0){|accum, i| accum +(i-m)**2 }
+      sum/(self.length - 1).to_f
+    end
+    def standard_deviation
+      Math.sqrt(self.sample_variance)
+    end
+end
+def load_predictions(input_file)
+	predictions_data = []
+	File.open(input_file).each do |line|
+		line.chomp!
+		protID, domains, funSys, combScore = line.split("\t")
+		predictions_data << [protID, domains, funSys, combScore.to_f]
+	end
+	return predictions_data
+end
+##########################
+#OPT-PARSER
+##########################
+options = {}
+OptionParser.new do |opts|
+	opts.banner = "Usage: #{__FILE__} [options]"
+	options[:input_file] = nil
+		opts.on("-a", "--input_file PATH", "Input file with association values to normalize") do |data|
+		options[:input_file] = data
+	end
+	options[:integration_method] = 'fisher'
+		opts.on("-i", "--integration_method STRING", "Integration method") do |data|
+		options[:integration_method] = data
+	end
+	options[:normalization_mode] = 'normal'
+		opts.on("-m", "--normalization_mode STRING", "Normalization mode: normal or max") do |data|
+		options[:normalization_mode] = data
+	end
+	options[:output_file] = 'normalized_associations.txt'
+		opts.on("-o", "--output_file PATH", "Output association file with normalized values") do |data|
+		options[:output_file] = data
+	end
+	opts.on_tail("-h", "--help", "Tool information") do
+		puts opts
+		exit
+	end
+end.parse!
+##########################
+#MAIN
+##########################
+predictions_data = load_predictions(options[:input_file])
+standardized_data = []
+if options[:integration_method] == 'fisher'
+	predictions_data.each do |protID, domains, funSys, combScore|
+		stdScore = 1 - combScore
+		#CAFA validation score must be in range (0,1]
+		#Remember that fisher's integration most significant value is 0
+		standardized_data << [protID, domains, funSys, stdScore] if stdScore > 0.001
+	end
+else
+	#https://www.researchgate.net/post/How_do_i_normalize_data_from_0_to_1_range
+	combScores = predictions_data.map{|a| a[3] }
+	combScoresAverage = combScores.mean
+	combScoresSD = combScores.standard_deviation
+	maxCombScore = combScores.max
+	predictions_data.each do |protID, domains, funSys, combScore|
+		if options[:normalization_mode] == 'normal'
+			score = (combScore - combScoresAverage).fdiv(combScoresSD)
+			if score > 2
+				score = 2
+			elsif score < -2
+				score = -2
+			end
+			stdScore = score.fdiv(4) + 0.5
+		elsif options[:normalization_mode] == 'max'
+			stdScore = combScore.fdiv(maxCombScore)
+		end
+		#CAFA validation score must be in range (0,1]
+		standardized_data << [protID, domains, funSys, stdScore] #if stdScore > 0.001
+	end
+end
+handler = File.open(options[:output_file], 'w')
+standardized_data.each do |data|
+	handler.puts data.join("\t")
+end
+handler.close

data/bin/prepare_cafa_network.rb ADDED Viewed

@@ -0,0 +1,96 @@
+#! /usr/bin/env ruby
+##########################
+# Rojano E. & Seoane P., July 2019
+# Generate GO tripartite networks filtered with CAFA data
+# For its use as system control (only GO)
+##########################
+REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
+ROOT_PATH = File.dirname(__FILE__)
+$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
+require 'optparse'
+require 'generalMethods.rb'
+##########################
+#METHODS
+##########################
+def load_network_data(network)
+	go_gene_rels = {}
+	domain_gene_rels = {}
+	File.open(network).each do |line|
+		line.chomp!
+		term, gene = line.split("\t")
+		if term.include?('GO:')
+			query = go_gene_rels[gene]
+			if query.nil?
+				go_gene_rels[gene] = [term]
+			else
+				query << term
+			end
+		else
+			query = domain_gene_rels[gene]
+			if query.nil?
+				domain_gene_rels[gene] = [term]
+			else
+				query << term
+			end
+		end
+	end
+	return go_gene_rels, domain_gene_rels
+end
+def check_genes(term_protein_rels, cafa_data)
+	cafa_data.each do |gene, go_list|
+		term_protein_rels.delete(gene)
+	end
+	return term_protein_rels
+end
+##########################
+#OPT-PARSER
+##########################
+options = {}
+OptionParser.new do |opts|
+  opts.banner = "Usage: #{__FILE__} [options]"
+  options[:cafa_data] = nil
+  opts.on("-a", "--cafa_data PATH", "Input CAFA gene annotations") do |data|
+    options[:cafa_data] = data
+  end
+  options[:input_network] = nil
+  opts.on("-n", "--input_network PATH", "Input network to parse") do |data|
+    options[:input_network] = data
+  end
+  options[:output_network] = 'cafa_network.txt'
+  opts.on("-o", "--output_network PATH", "Output network without CAFA proteins") do |data|
+    options[:output_network] = data
+  end
+  opts.on_tail("-h", "--help", "Show tool help") do
+  	puts opts
+  	exit
+  end
+end.parse!
+##########################
+#MAIN
+##########################
+cafa_data = load_cafa_data(options[:cafa_data])
+go_gene_rels, domain_gene_rels = load_network_data(options[:input_network])
+genes2gos_layer = check_genes(go_gene_rels, cafa_data)
+genes2domains_layer = check_genes(domain_gene_rels, cafa_data)
+handler = File.open(options[:output_network], 'w')
+genes2gos_layer.each do |gene, go_list|
+	go_list.each do |go_term|
+		handler.puts "#{go_term}\t#{gene}"
+	end
+end
+genes2domains_layer.each do |gene, domains|
+	domains.each do |domain|
+		handler.puts "#{domain}\t#{gene}"
+	end
+end
+handler.close

data/bin/setup ADDED Viewed

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/bin/standardize_scores.R ADDED Viewed

@@ -0,0 +1,53 @@
+#! /usr/bin/env Rscript
+library(optparse)
+library(ggplot2)
+#####################
+## OPTPARSE
+#####################
+# Command-line interface: input table (-d), output path (-o), optional external
+# score (-e), name of the column to standardise (-s) and a flag (-p) that
+# switches the script to its plot-only branch.
+option_list <- list(
+	make_option(c("-d", "--data_file"), type="character",
+		help="Tabulated file with information about each sample"),
+	make_option(c("-o", "--output"), type="character", default="results",
+		help="Output figure file"),
+	make_option(c("-e", "--external_score"), type="double", default=NULL,
+		help="Use external score"),
+    make_option(c("-s", "--set_column"), type="character", default="",
+       	help="Name of column to be converted to Z-scores"),
+    make_option(c("-p", "--plot_distribution"), action="store_true", default=FALSE,
+		help="Print plot distribution")
+)
+# Parsed options are read below as opt$data_file, opt$output, opt$external_score,
+# opt$set_column and opt$plot_distribution
+opt <- parse_args(OptionParser(option_list=option_list))
+################################################################
+## MAIN
+################################################################
+# Read the tab-separated table without a header row.
+# NOTE(review): with header=FALSE the columns presumably get R's automatic
+# names, so --set_column would have to be given in that form - TODO confirm.
+data <- read.table(opt$data_file, sep="\t", header=FALSE)
+raw_data <- data[[opt$set_column]]
+# The external score is placed FIRST so it can be located (and later removed) by position
+if(!is.null(opt$external_score)){
+	raw_data <- c(opt$external_score, raw_data)
+}
+# Plot-only branch: build a boxplot of the values, print the ggplot_build() result
+# and quit with status 0; nothing is written to opt$output in this case
+if(opt$plot_distribution){
+	dataframe <- as.data.frame(raw_data)
+	colnames(dataframe) <- c("AssociationValue")
+	plot <- ggplot(dataframe, aes(y=AssociationValue)) +
+		geom_boxplot()
+	print(ggplot_build(plot))
+	quit(save = "default", status = 0, runLast = TRUE)
+}
+#print(ggplot_build(plot))
+#message(mean(raw_data))
+# Centre and scale the values (the external score, when given, takes part in this)
+z_scores = scale(raw_data, center=TRUE, scale=TRUE)
+ if(!is.null(opt$external_score)){
+	# First element is the external score: report it, then drop it so the
+	# remaining values line up with the rows of data again
+	external_score2z_score <- z_scores[1]
+	cat(sep="","ExtZScore\t",external_score2z_score,"\n")
+	z_scores <- z_scores[-1] #remove external score
+}
+# Overwrite the selected column and write the table back without quotes, row names or header
+data[[opt$set_column]] <- z_scores
+write.table(data, file=opt$output, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)

data/bin/translate_kegg_genes2pathways.rb ADDED Viewed

@@ -0,0 +1,98 @@
+#! /usr/bin/env ruby
+##########################
+# Rojano E. & Seoane P., July 2019
+# Translate KEGG genes into pathways
+##########################
+REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
+ROOT_PATH = File.dirname(__FILE__)
+$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
+# require 'generalMethods.rb'
+require 'optparse'
+##########################
+#METHODS
+##########################
+def load_kegg_dictionary(pathway_to_genes_file)
+	kegg_dictionary = {}
+	File.open(pathway_to_genes_file).each do |line|
+		line.chomp!
+		keggGeneID, keggPathwayID = line.split("\t")
+		query = kegg_dictionary[keggGeneID]
+		if query.nil?
+			kegg_dictionary[keggGeneID] = [keggPathwayID]
+		else
+			query << keggPathwayID
+		end
+	end
+	return kegg_dictionary
+end
+def load_network(network_kegg_file)
+	network_kegg = []
+	superfamily_ids = []
+	File.open(network_kegg_file).each do |line|
+		line.chomp!
+		kegg_gene_ID, gene = line.split("\t")
+		if kegg_gene_ID.include?('hsa:')
+			network_kegg << [kegg_gene_ID, gene]
+		else
+			superfamily_ids << [kegg_gene_ID, gene]
+		end
+	end
+	return network_kegg, superfamily_ids
+end
+##########################
+#OPT-PARSER
+##########################
+options = {}
+OptionParser.new do |opts|
+  opts.banner = "Usage: #{__FILE__} [options]"
+  options[:kegg_pathways] = nil
+  opts.on("-k", "--kegg_pathways PATH", "Input file with KEGG genes to pathways") do |data|
+    options[:kegg_pathways] = data
+  end
+  options[:network_kegg] = nil
+  opts.on("-n", "--network_kegg PATH", "Network with KEGG genes to translate into pathways") do |data|
+    options[:network_kegg] = data
+  end
+  options[:output_path] = 'network_kegg_pathways'
+  opts.on("-o", "--output_path PATH", "Resulting network output path") do |data|
+    options[:output_path] = data
+  end
+  opts.on_tail("-h", "--help", "Show this message") do
+    puts opts
+    exit
+  end
+end.parse!
+##########################
+#MAIN
+##########################
+kegg_dictionary = load_kegg_dictionary(options[:kegg_pathways])
+network_kegg, superfamily_ids = load_network(options[:network_kegg])
+pathways_network = []
+network_kegg.each do |kegg_gene_ID, proteinID|
+	pathwayIDs = kegg_dictionary[kegg_gene_ID]
+	unless pathwayIDs.nil?
+		pathwayIDs.each do |pathway|
+			pathways_network << [pathway, proteinID]
+		end
+	end
+end
+handler = File.open(options[:output_path], 'w')
+pathways_network.each do |pair|
+	handler.puts pair.join("\t")
+end
+superfamily_ids.each do |pair|
+	handler.puts pair.join("\t")
+end
+handler.close

data/bin/validate_ProtFunSys_predictions.rb ADDED Viewed

@@ -0,0 +1,174 @@
+#! /usr/bin/env ruby
+##########################
+# Rojano E. & Seoane P., June 2019
+# Domain to functional annotation predictor validation system
+# The script uses the predictions file and proteins-FunSys from UniProtKB
+# It compares the predictions with the proteins-FunSys to validate the functioning of the predictor
+# Generate values to plot in a PR
+##########################
+REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
+ROOT_PATH = File.dirname(__FILE__)
+$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
+require 'generalMethods.rb'
+require 'optparse'
+require "statistics2"
+require 'bigdecimal'
+##########################
+#METHODS
+##########################
+def load_predictions_file(predictions_file)
+	predictions = []
+	File.open(predictions_file).each do |line|
+		line.chomp!
+		next if line.include?('ProteinID')
+		protein, domains, funSys, combinedScore = line.split("\t")
+		predictions << [protein, funSys, combinedScore.to_f]
+	end
+	return predictions
+end
+def load_control_file(control_file)
+	control_protein_FunSys = {}
+	File.open(control_file).each do |line|
+		line.chomp!
+		proteinID, funSys = line.split("\t")
+		control_protein_FunSys[proteinID] = funSys.split(';')
+	end
+	return control_protein_FunSys
+end
+def load_prediction(pairs_array)
+	pred = {}
+	min = nil
+	max = nil
+	pairs_array.each do |key, label, score| #protein, FunSys, assocScore
+		query = pred[key]
+		if !min.nil? && !max.nil?
+			min = score if score < min
+			max = score if score > max
+		else
+			min = score; max = score
+		end
+		if query.nil?
+			pred[key] = [[label], [score]]
+		else
+			query.first << label
+			query.last << score
+		end
+	end
+	return pred, [min, max]
+end
+# Pandey 2007, Association Analysis-based Transformations for Protein Interaction Networks: A Function Prediction Case Study
+def get_pred_rec(meth, cut_number = 100, top_number = 10000, control_protein_FunSys, predictions)
+	performance = [] #cut, pred, rec
+	preds, limits = load_prediction(predictions)
+	cuts = get_cuts(limits, cut_number)
+	cuts.each do |cut|
+		prec, rec = pred_rec(preds, cut, top_number, control_protein_FunSys)
+		performance << [cut, prec, rec]
+	end
+	return performance
+end
+def pred_rec(preds, cut, top, control_protein_FunSys)
+	predicted_labels = 0 #m
+	true_labels = 0 #n
+	common_labels = 0 # k
+	control_protein_FunSys.each do |key, c_labels|
+		true_labels += c_labels.length #n
+		pred_info = preds[key]
+		if !pred_info.nil?
+			labels, scores = pred_info
+			reliable_labels = get_reliable_labels(labels, scores, cut, top)
+			predicted_labels += reliable_labels.length #m
+			common_labels += (c_labels & reliable_labels).length #k
+		end
+	end
+	#puts "cut: #{cut} trueL: #{true_labels} predL: #{predicted_labels} commL: #{common_labels}"
+	prec = common_labels.to_f/predicted_labels
+	rec = common_labels.to_f/true_labels
+	prec = 0.0 if prec.nan?
+	rec = 0.0 if rec.nan?
+	return prec, rec
+end
+def get_cuts(limits, n_cuts)
+	cuts = []
+	range = (limits.last - limits.first).abs.fdiv(n_cuts)
+	range = BigDecimal(range, 10)
+	cut = limits.first
+	(n_cuts + 1).times do |n|
+		cuts << (cut + n * range).to_f
+	end
+	return cuts
+end
+def get_reliable_labels(labels, scores, cut, top)
+	reliable_labels = []
+	scores.each_with_index do |score, i|
+		reliable_labels << [labels[i], score] if score >= cut
+	end
+	reliable_labels = reliable_labels.sort!{|l1,l2| l2.last <=> l1.last}[0..top-1].map{|pred| pred.first}
+	return reliable_labels
+end
+##########################
+#OPT-PARSER
+##########################
+options = {}
+OptionParser.new do |opts|
+  opts.banner = "Usage: #{__FILE__} [options]"
+	options[:input_predictions] = nil
+		opts.on("-a", "--input_predictions PATH", "Domain-function predictions") do |data|
+		options[:input_predictions] = data
+	end
+	options[:control_file] = nil
+		opts.on("-c", "--control_file PATH", "Control dataset with proteins-FunSys from UniProtKB") do |data|
+		options[:control_file] = data
+	end
+	options[:assoc_meth] = nil
+		opts.on("-m", "--assoc_meth STRING", "Association method used") do |data|
+		options[:assoc_meth] = data
+	end
+	options[:performance_file] = 'precision_recall.txt'
+	opts.on("-p", "--performance_file PATH", "Output file with PR values") do |data|
+	options[:performance_file] = data
+	end
+	opts.on_tail("-h", "--help", "Show this message") do
+		puts opts
+		exit
+	end
+end.parse!
+##########################
+#MAIN
+##########################
+control_protein_FunSys = load_control_file(options[:control_file])
+domains_FunSys_predictions = load_predictions_file(options[:input_predictions])
+performance = get_pred_rec(options[:assoc_meth], 100, 10000, control_protein_FunSys, domains_FunSys_predictions)
+File.open(options[:performance_file], 'w') do |f|
+	f.puts %w[cut prec rec meth].join("\t")
+	performance.each do |item|
+		item << options[:assoc_meth].to_s
+		f.puts item.join("\t")
+	end
+end