DomFun 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,118 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., July 2019
4
+ # Normalize combined scores for their use with CAFA validation
5
+ ##########################
6
+
7
+ require 'optparse'
8
+
9
+ ##########################
10
+ #METHODS
11
+ ##########################
12
# Descriptive-statistics helpers added to every Enumerable.
#
# +mean+ and +sample_variance+ call +length+, so the receiver has to respond
# to it (an Array of numbers in this script).
module Enumerable

  # Adds up all elements starting from 0.
  # Only defined when the running Ruby has no Enumerable#sum of its own, so
  # the built-in signature (initial value and block) is never overwritten.
  unless method_defined?(:sum)
    def sum
      inject(0) { |accum, i| accum + i }
    end
  end

  # Arithmetic mean as a Float.
  def mean
    sum / length.to_f
  end

  # Unbiased sample variance (divides by length - 1).
  # Needs at least two elements to give a meaningful value.
  def sample_variance
    centre = mean
    squared_deviations = inject(0) { |accum, i| accum + (i - centre)**2 }
    squared_deviations / (length - 1).to_f
  end

  # Square root of the sample variance.
  def standard_deviation
    Math.sqrt(sample_variance)
  end

end
33
+
34
# Reads a tab-separated predictions file.
#
# Each non-empty line is split into four columns: protein ID, domains,
# functional system and combined score. The score is converted with +to_f+.
# Empty lines are skipped so they do not produce all-nil records.
#
# @param input_file [String] path of the file to read
# @return [Array<Array>] one [protID, domains, funSys, combScore] per line
def load_predictions(input_file)
  predictions_data = []
  # File.foreach closes the file once iteration finishes.
  File.foreach(input_file) do |line|
    line = line.chomp
    next if line.empty?
    protID, domains, funSys, combScore = line.split("\t")
    predictions_data << [protID, domains, funSys, combScore.to_f]
  end
  predictions_data
end
43
+
44
+ ##########################
45
+ #OPT-PARSER
46
+ ##########################
47
# Command-line options, pre-filled with their default values.
options = {
  input_file: nil,
  integration_method: 'fisher',
  normalization_mode: 'normal',
  output_file: 'normalized_associations.txt'
}

option_parser = OptionParser.new
option_parser.banner = "Usage: #{__FILE__} [options]"

option_parser.on("-a", "--input_file PATH", "Input file with association values to normalize") do |value|
  options[:input_file] = value
end

option_parser.on("-i", "--integration_method STRING", "Integration method") do |value|
  options[:integration_method] = value
end

option_parser.on("-m", "--normalization_mode STRING", "Normalization mode: normal or max") do |value|
  options[:normalization_mode] = value
end

option_parser.on("-o", "--output_file PATH", "Output association file with normalized values") do |value|
  options[:output_file] = value
end

# Print the usage text and stop.
option_parser.on_tail("-h", "--help", "Tool information") do
  puts option_parser
  exit
end

option_parser.parse!
77
+
78
+ ##########################
79
+ #MAIN
80
+ ##########################
81
# Load the predictions, rescale the combined score of every record and write
# the result as a tab-separated file.
predictions_data = load_predictions(options[:input_file])
standardized_data = []

if options[:integration_method] == 'fisher'
  predictions_data.each do |protID, domains, funSys, combScore|
    # Remember that fisher's integration most significant value is 0,
    # so the score is inverted.
    stdScore = 1 - combScore
    #CAFA validation score must be in range (0,1]
    standardized_data << [protID, domains, funSys, stdScore] if stdScore > 0.001
  end
else
  normalization_mode = options[:normalization_mode]
  # Fail early: an unrecognised mode would otherwise leave every score nil
  # and produce an output file with empty score columns.
  unless %w[normal max].include?(normalization_mode)
    abort "Unknown normalization mode '#{normalization_mode}'. Use 'normal' or 'max'."
  end

  #https://www.researchgate.net/post/How_do_i_normalize_data_from_0_to_1_range
  combScores = predictions_data.map { |record| record[3] }
  combScoresAverage = combScores.mean
  combScoresSD = combScores.standard_deviation
  maxCombScore = combScores.max
  predictions_data.each do |protID, domains, funSys, combScore|
    if normalization_mode == 'normal'
      # Z-score limited to [-2, 2], then mapped linearly onto [0, 1].
      score = (combScore - combScoresAverage).fdiv(combScoresSD)
      if score > 2
        score = 2
      elsif score < -2
        score = -2
      end
      stdScore = score.fdiv(4) + 0.5
    else
      # 'max' mode: score relative to the largest combined score.
      stdScore = combScore.fdiv(maxCombScore)
    end
    #CAFA validation score must be in range (0,1]
    standardized_data << [protID, domains, funSys, stdScore]
  end
end

# Block form closes the output file even if writing raises.
File.open(options[:output_file], 'w') do |handler|
  standardized_data.each do |data|
    handler.puts data.join("\t")
  end
end
@@ -0,0 +1,96 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., July 2019
4
+ # Generate GO tripartite networks filtered with CAFA data
5
+ # For its use as system control (only GO)
6
+ ##########################
7
+
8
+ REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
9
+ ROOT_PATH = File.dirname(__FILE__)
10
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
11
+ require 'optparse'
12
+ require 'generalMethods.rb'
13
+
14
+ ##########################
15
+ #METHODS
16
+ ##########################
17
+
18
# Reads a two-column, tab-separated network file (term, gene) and splits it
# into two gene-indexed hashes.
#
# Terms containing 'GO:' go to the first hash, every other term to the
# second. Lines without any content are skipped.
#
# @param network [String] path of the network file
# @return [Array(Hash, Hash)] gene => GO terms, gene => other terms
def load_network_data(network)
  go_gene_rels = {}
  domain_gene_rels = {}
  # File.foreach closes the file once iteration finishes.
  File.foreach(network) do |line|
    term, gene = line.chomp.split("\t")
    next if term.nil? # empty line
    target = term.include?('GO:') ? go_gene_rels : domain_gene_rels
    (target[gene] ||= []) << term
  end
  return go_gene_rels, domain_gene_rels
end
42
+
43
# Removes from term_protein_rels every gene that appears in cafa_data.
# The hash is modified in place and also returned.
def check_genes(term_protein_rels, cafa_data)
  cafa_data.each { |gene, _annotations| term_protein_rels.delete(gene) }
  term_protein_rels
end
49
+ ##########################
50
+ #OPT-PARSER
51
+ ##########################
52
# Command-line options, pre-filled with their default values.
options = {
  cafa_data: nil,
  input_network: nil,
  output_network: 'cafa_network.txt'
}

option_parser = OptionParser.new
option_parser.banner = "Usage: #{__FILE__} [options]"

option_parser.on("-a", "--cafa_data PATH", "Input CAFA gene annotations") do |value|
  options[:cafa_data] = value
end

option_parser.on("-n", "--input_network PATH", "Input network to parse") do |value|
  options[:input_network] = value
end

option_parser.on("-o", "--output_network PATH", "Output network without CAFA proteins") do |value|
  options[:output_network] = value
end

# Print the usage text and stop.
option_parser.on_tail("-h", "--help", "Show tool help") do
  puts option_parser
  exit
end

option_parser.parse!
77
+
78
+ ##########################
79
+ #MAIN
80
+ ##########################
81
# Load the CAFA annotations and the network, drop every gene present in the
# CAFA data from both layers, and write the remaining term/gene pairs.
cafa_data = load_cafa_data(options[:cafa_data])
go_gene_rels, domain_gene_rels = load_network_data(options[:input_network])
genes2gos_layer = check_genes(go_gene_rels, cafa_data)
genes2domains_layer = check_genes(domain_gene_rels, cafa_data)

# Block form closes the output file even if writing raises.
File.open(options[:output_network], 'w') do |handler|
  # GO layer first, then the domain layer, one "term<TAB>gene" pair per line.
  [genes2gos_layer, genes2domains_layer].each do |layer|
    layer.each do |gene, terms|
      terms.each do |term|
        handler.puts "#{term}\t#{gene}"
      end
    end
  end
end
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Development bootstrap: runs `bundle install`.
# Stop on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail
# Split words on newline and tab only.
IFS=$'\n\t'
# Echo every line as it is read and every command as it is executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,53 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ library(optparse)
4
+ library(ggplot2)
5
+ #####################
6
+ ## OPTPARSE
7
+ #####################
8
# Command-line interface: each option is declared on its own, then collected
# into option_list and parsed into opt.
data_file_option <- make_option(c("-d", "--data_file"), type="character",
  help="Tabulated file with information about each sample")
output_option <- make_option(c("-o", "--output"), type="character", default="results",
  help="Output figure file")
external_score_option <- make_option(c("-e", "--external_score"), type="double", default=NULL,
  help="Use external score")
set_column_option <- make_option(c("-s", "--set_column"), type="character", default="",
  help="Name of column to be converted to Z-scores")
plot_distribution_option <- make_option(c("-p", "--plot_distribution"), action="store_true", default=FALSE,
  help="Print plot distribution")

option_list <- list(
  data_file_option,
  output_option,
  external_score_option,
  set_column_option,
  plot_distribution_option
)
opt <- parse_args(OptionParser(option_list=option_list))
21
+
22
+
23
+ ################################################################
24
+ ## MAIN
25
+ ################################################################
26
+
27
# Read the tab-separated table (no header row) and pick the column to convert.
data <- read.table(opt$data_file, header=FALSE, sep="\t")
target_column <- opt$set_column
has_external_score <- !is.null(opt$external_score)

# When an external score is supplied it is placed in front of the column
# values so that it is standardised together with them.
raw_data <- data[[target_column]]
if (has_external_score) {
  raw_data <- c(opt$external_score, raw_data)
}

# Plot-only mode: print the built boxplot data and stop before writing output.
if (opt$plot_distribution) {
  dataframe <- as.data.frame(raw_data)
  colnames(dataframe) <- "AssociationValue"
  plot <- ggplot(dataframe, aes(y=AssociationValue)) + geom_boxplot()
  print(ggplot_build(plot))
  quit(save = "default", status = 0, runLast = TRUE)
}

# Centre and scale the values to Z-scores.
z_scores <- scale(raw_data, center=TRUE, scale=TRUE)
if (has_external_score) {
  # The first entry is the external score: report it, then drop it so the
  # remaining values line up with the table rows again.
  cat("ExtZScore\t", z_scores[1], "\n", sep="")
  z_scores <- z_scores[-1]
}

data[[target_column]] <- z_scores

write.table(data, file=opt$output, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
@@ -0,0 +1,98 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., July 2019
4
+ # Translate KEGG genes into pathways
5
+ ##########################
6
+
7
+ REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
8
+ ROOT_PATH = File.dirname(__FILE__)
9
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
10
+ # require 'generalMethods.rb'
11
+ require 'optparse'
12
+
13
+ ##########################
14
+ #METHODS
15
+ ##########################
16
+
17
# Builds a lookup from KEGG gene ID to the pathway IDs listed for it.
#
# The file is tab-separated with the gene ID in the first column and the
# pathway ID in the second. Empty lines are skipped so that no nil key is
# inserted.
#
# @param pathway_to_genes_file [String] path of the file to read
# @return [Hash{String => Array}] gene ID => pathway IDs, in file order
def load_kegg_dictionary(pathway_to_genes_file)
  kegg_dictionary = {}
  # File.foreach closes the file once iteration finishes.
  File.foreach(pathway_to_genes_file) do |line|
    keggGeneID, keggPathwayID = line.chomp.split("\t")
    next if keggGeneID.nil? # empty line
    (kegg_dictionary[keggGeneID] ||= []) << keggPathwayID
  end
  kegg_dictionary
end
32
+
33
# Reads a two-column, tab-separated network file and separates its rows.
#
# Rows whose first column contains 'hsa:' are returned in the first array,
# all other rows in the second. Empty lines are skipped.
#
# @param network_kegg_file [String] path of the network file
# @return [Array(Array, Array)] 'hsa:' pairs, remaining pairs
def load_network(network_kegg_file)
  network_kegg = []
  superfamily_ids = []
  # File.foreach closes the file once iteration finishes.
  File.foreach(network_kegg_file) do |line|
    kegg_gene_ID, gene = line.chomp.split("\t")
    next if kegg_gene_ID.nil? # empty line
    bucket = kegg_gene_ID.include?('hsa:') ? network_kegg : superfamily_ids
    bucket << [kegg_gene_ID, gene]
  end
  return network_kegg, superfamily_ids
end
47
+
48
+ ##########################
49
+ #OPT-PARSER
50
+ ##########################
51
# Command-line options, pre-filled with their default values.
options = {
  kegg_pathways: nil,
  network_kegg: nil,
  output_path: 'network_kegg_pathways'
}

option_parser = OptionParser.new
option_parser.banner = "Usage: #{__FILE__} [options]"

option_parser.on("-k", "--kegg_pathways PATH", "Input file with KEGG genes to pathways") do |value|
  options[:kegg_pathways] = value
end

option_parser.on("-n", "--network_kegg PATH", "Network with KEGG genes to translate into pathways") do |value|
  options[:network_kegg] = value
end

option_parser.on("-o", "--output_path PATH", "Resulting network output path") do |value|
  options[:output_path] = value
end

# Print the usage text and stop.
option_parser.on_tail("-h", "--help", "Show this message") do
  puts option_parser
  exit
end

option_parser.parse!
76
+
77
+ ##########################
78
+ #MAIN
79
+ ##########################
80
# Replace every 'hsa:' gene in the network by the pathways listed for it in
# the dictionary; rows of the other kind are written through unchanged.
kegg_dictionary = load_kegg_dictionary(options[:kegg_pathways])
network_kegg, superfamily_ids = load_network(options[:network_kegg])

# One [pathway, protein] pair per pathway of each gene; genes missing from
# the dictionary contribute nothing.
pathways_network = network_kegg.flat_map do |kegg_gene_ID, proteinID|
  (kegg_dictionary[kegg_gene_ID] || []).map { |pathway| [pathway, proteinID] }
end

# Block form closes the output file even if writing raises.
File.open(options[:output_path], 'w') do |handler|
  pathways_network.each do |pair|
    handler.puts pair.join("\t")
  end
  superfamily_ids.each do |pair|
    handler.puts pair.join("\t")
  end
end
@@ -0,0 +1,174 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., June 2019
4
+ # Domain to functional annotation predictor validation system
5
+ # The script uses the predictions file and proteins-FunSys from UniProtKB
6
+ # It compares the predictions with the proteins-FunSys to validate the functioning of the predictor
7
+ # Generate values to plot in a PR
8
+ ##########################
9
+
10
+
11
+ REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
12
+ ROOT_PATH = File.dirname(__FILE__)
13
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
14
+ require 'generalMethods.rb'
15
+ require 'optparse'
16
+ require "statistics2"
17
+ require 'bigdecimal'
18
+
19
+
20
+ ##########################
21
+ #METHODS
22
+ ##########################
23
+
24
# Reads a tab-separated predictions file with the columns protein, domains,
# functional system and combined score.
#
# Lines containing 'ProteinID' are treated as headers and skipped, as are
# empty lines. The domains column is read but not kept.
#
# @param predictions_file [String] path of the file to read
# @return [Array<Array>] one [protein, funSys, combinedScore] per data line,
#   with the score converted by +to_f+
def load_predictions_file(predictions_file)
  predictions = []
  # File.foreach closes the file once iteration finishes.
  File.foreach(predictions_file) do |line|
    line = line.chomp
    next if line.empty? || line.include?('ProteinID')
    protein, _domains, funSys, combinedScore = line.split("\t")
    predictions << [protein, funSys, combinedScore.to_f]
  end
  predictions
end
34
+
35
# Reads the control file: one protein per line, followed by a tab and a
# ';'-separated list of functional systems.
#
# Empty lines are skipped. A protein without a second column gets an empty
# list instead of raising.
#
# @param control_file [String] path of the file to read
# @return [Hash{String => Array<String>}] protein ID => functional systems
def load_control_file(control_file)
  control_protein_FunSys = {}
  # File.foreach closes the file once iteration finishes.
  File.foreach(control_file) do |line|
    proteinID, funSys = line.chomp.split("\t")
    next if proteinID.nil? # empty line
    control_protein_FunSys[proteinID] = funSys.to_s.split(';')
  end
  control_protein_FunSys
end
44
+
45
# Groups [key, label, score] triples by key and records the score range.
#
# Returns the grouped hash (key => [labels, scores], both in input order)
# together with [lowest score, highest score]; both limits are nil when
# pairs_array is empty.
def load_prediction(pairs_array)
  grouped = {}
  lowest = nil
  highest = nil
  pairs_array.each do |key, label, score| #protein, FunSys, assocScore
    if lowest.nil? || highest.nil?
      # No usable limits recorded yet: start both from this score.
      lowest = highest = score
    else
      lowest = score if score < lowest
      highest = score if score > highest
    end
    entry = (grouped[key] ||= [[], []])
    entry.first << label
    entry.last << score
  end
  return grouped, [lowest, highest]
end
66
+
67
+
68
+ # Pandey 2007, Association Analysis-based Transformations for Protein Interaction Networks: A Function Prediction Case Study
69
# Computes precision and recall at evenly spaced score cuts.
#
# The predictions are grouped with load_prediction, the cuts come from
# get_cuts over the observed score range, and pred_rec is evaluated at each
# cut. meth is accepted but not used inside this method.
#
# Returns an array of [cut, precision, recall] rows, one per cut.
def get_pred_rec(meth, cut_number = 100, top_number = 10000, control_protein_FunSys, predictions)
  preds, limits = load_prediction(predictions)
  get_cuts(limits, cut_number).map do |cut|
    prec, rec = pred_rec(preds, cut, top_number, control_protein_FunSys)
    [cut, prec, rec]
  end
end
79
+
80
# Precision and recall of the predictions at a single score cut.
#
# For every control protein the labels kept by get_reliable_labels are
# compared with the control labels. Precision is common / predicted labels,
# recall is common / control labels; a NaN ratio (0/0) is reported as 0.0.
#
# Returns [precision, recall].
def pred_rec(preds, cut, top, control_protein_FunSys)
  n_predicted = 0 #m
  n_true = 0      #n
  n_common = 0    #k
  control_protein_FunSys.each do |key, c_labels|
    n_true += c_labels.length
    pred_info = preds[key]
    next if pred_info.nil? # protein without predictions
    labels, scores = pred_info
    reliable = get_reliable_labels(labels, scores, cut, top)
    n_predicted += reliable.length
    n_common += (c_labels & reliable).length
  end
  prec = n_common.to_f / n_predicted
  rec = n_common.to_f / n_true
  [prec.nan? ? 0.0 : prec, rec.nan? ? 0.0 : rec]
end
101
+
102
# Returns n_cuts + 1 evenly spaced cut values, starting at limits.first and
# advancing by |limits.last - limits.first| / n_cuts.
# The step is held as a BigDecimal (precision 10) and each cut is converted
# back to Float.
def get_cuts(limits, n_cuts)
  lower = limits.first
  step = BigDecimal((limits.last - lower).abs.fdiv(n_cuts), 10)
  (0..n_cuts).map { |n| (lower + n * step).to_f }
end
112
+
113
# Selects the labels whose score is at least cut and returns at most top of
# them, ordered from highest to lowest score.
#
# labels and scores are parallel arrays. A top of 0 (or less) yields an
# empty list; the previous slice [0..top-1] returned every label for top 0.
#
# @return [Array] the selected labels
def get_reliable_labels(labels, scores, cut, top)
  reliable_labels = []
  scores.each_with_index do |score, i|
    reliable_labels << [labels[i], score] if score >= cut
  end
  reliable_labels.sort! { |l1, l2| l2.last <=> l1.last }
  reliable_labels.first([top, 0].max).map { |pred| pred.first }
end
121
+
122
+
123
+ ##########################
124
+ #OPT-PARSER
125
+ ##########################
126
+
127
# Command-line options, pre-filled with their default values.
options = {
  input_predictions: nil,
  control_file: nil,
  assoc_meth: nil,
  performance_file: 'precision_recall.txt'
}

option_parser = OptionParser.new
option_parser.banner = "Usage: #{__FILE__} [options]"

option_parser.on("-a", "--input_predictions PATH", "Domain-function predictions") do |value|
  options[:input_predictions] = value
end

option_parser.on("-c", "--control_file PATH", "Control dataset with proteins-FunSys from UniProtKB") do |value|
  options[:control_file] = value
end

option_parser.on("-m", "--assoc_meth STRING", "Association method used") do |value|
  options[:assoc_meth] = value
end

option_parser.on("-p", "--performance_file PATH", "Output file with PR values") do |value|
  options[:performance_file] = value
end

# Print the usage text and stop.
option_parser.on_tail("-h", "--help", "Show this message") do
  puts option_parser
  exit
end

option_parser.parse!
157
+
158
+ ##########################
159
+ #MAIN
160
+ ##########################
161
+
162
# Load the control set and the predictions, evaluate precision/recall at
# 100 cuts (up to 10000 labels per protein) and write one row per cut.
control_protein_FunSys = load_control_file(options[:control_file])
domains_FunSys_predictions = load_predictions_file(options[:input_predictions])
performance = get_pred_rec(options[:assoc_meth], 100, 10000, control_protein_FunSys, domains_FunSys_predictions)

# The association method name is appended to every row as a fourth column.
method_label = options[:assoc_meth].to_s
File.open(options[:performance_file], 'w') do |report|
  report.puts ['cut', 'prec', 'rec', 'meth'].join("\t")
  performance.each { |row| report.puts row.push(method_label).join("\t") }
end