DomFun 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,118 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., July 2019
4
+ # Normalize combined scores for their use with CAFA validation
5
+ ##########################
6
+
7
+ require 'optparse'
8
+
9
+ ##########################
10
+ #METHODS
11
+ ##########################
12
# Descriptive statistics added to every Enumerable (Array, Range, Enumerator...).
module Enumerable

  # Fallback sum for Rubies without a native Enumerable#sum. When the native
  # method exists it is left untouched, so callers can keep passing an initial
  # value or a block to it.
  unless method_defined?(:sum)
    def sum
      inject(0) { |accum, i| accum + i }
    end
  end

  # Arithmetic mean of the elements, as a Float.
  # Returns NaN for an empty collection (0 / 0.0).
  def mean
    # to_a gives a countable collection even for receivers without #length.
    values = to_a
    values.sum / values.length.to_f
  end

  # Unbiased sample variance (divides by n - 1).
  # Returns NaN for a single element, because 0.0 is divided by 0.0.
  def sample_variance
    values = to_a
    m = values.mean
    squared_deviations = values.inject(0) { |accum, i| accum + (i - m)**2 }
    squared_deviations / (values.length - 1).to_f
  end

  # Square root of the sample variance.
  def standard_deviation
    Math.sqrt(sample_variance)
  end

end
33
+
34
# Reads a tab-separated predictions file.
#
# Each non-blank line is split on tabs into protein ID, domains, functional
# system and combined score; the score is converted with to_f.
#
# input_file - path of the file to read.
#
# Returns an Array of [protID, domains, funSys, combScore] records.
def load_predictions(input_file)
  predictions_data = []
  # File.foreach closes the file when iteration finishes.
  File.foreach(input_file) do |line|
    fields = line.chomp.split("\t")
    # Blank lines would otherwise become [nil, nil, nil, 0.0] records.
    next if fields.empty?
    protID, domains, funSys, combScore = fields
    predictions_data << [protID, domains, funSys, combScore.to_f]
  end
  predictions_data
end
43
+
44
+ ##########################
45
+ #OPT-PARSER
46
+ ##########################
47
# Command-line options, pre-filled with their defaults; parsing overwrites
# the entries for which a flag was given.
options = {
  input_file: nil,
  integration_method: 'fisher',
  normalization_mode: 'normal',
  output_file: 'normalized_associations.txt'
}

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--input_file PATH", "Input file with association values to normalize") { |value| options[:input_file] = value }
  opts.on("-i", "--integration_method STRING", "Integration method") { |value| options[:integration_method] = value }
  opts.on("-m", "--normalization_mode STRING", "Normalization mode: normal or max") { |value| options[:normalization_mode] = value }
  opts.on("-o", "--output_file PATH", "Output association file with normalized values") { |value| options[:output_file] = value }

  # Print the usage text and stop.
  opts.on_tail("-h", "--help", "Tool information") do
    puts opts
    exit
  end
end
option_parser.parse!
77
+
78
+ ##########################
79
+ #MAIN
80
+ ##########################
81
# The input path is mandatory; stop with a readable message instead of
# letting File raise a TypeError on nil.
abort('ERROR: no input file given (use -a/--input_file PATH)') if options[:input_file].nil?

predictions_data = load_predictions(options[:input_file])
standardized_data = []

if options[:integration_method] == 'fisher'
  predictions_data.each do |protID, domains, funSys, combScore|
    stdScore = 1 - combScore
    #CAFA validation score must be in range (0,1]
    #Remember that fisher's integration most significant value is 0
    standardized_data << [protID, domains, funSys, stdScore] if stdScore > 0.001
  end
else
  mode = options[:normalization_mode]
  # An unrecognised mode used to leave the score nil and write empty fields.
  abort("ERROR: unknown normalization mode '#{mode}' (expected normal or max)") unless %w[normal max].include?(mode)

  #https://www.researchgate.net/post/How_do_i_normalize_data_from_0_to_1_range
  combScores = predictions_data.map { |a| a[3] }
  combScoresAverage = combScores.mean
  combScoresSD = combScores.standard_deviation
  maxCombScore = combScores.max
  # With a single record (SD is NaN) or identical scores (SD is 0) the z-score
  # is undefined; such scores are treated as lying exactly on the mean.
  no_spread = combScoresSD.nan? || combScoresSD.zero?

  predictions_data.each do |protID, domains, funSys, combScore|
    if mode == 'normal'
      score = no_spread ? 0.0 : (combScore - combScoresAverage).fdiv(combScoresSD)
      # Clamp the z-score to [-2, 2], then map that interval linearly onto [0, 1].
      score = [[score, 2].min, -2].max
      stdScore = score.fdiv(4) + 0.5
    else
      stdScore = combScore.fdiv(maxCombScore)
    end
    #CAFA validation score must be in range (0,1]
    standardized_data << [protID, domains, funSys, stdScore] #if stdScore > 0.001
  end
end

# Block form closes the output file even if writing raises.
File.open(options[:output_file], 'w') do |handler|
  standardized_data.each do |data|
    handler.puts data.join("\t")
  end
end
@@ -0,0 +1,96 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., July 2019
4
+ # Generate GO tripartite networks filtered with CAFA data
5
+ # For its use as system control (only GO)
6
+ ##########################
7
+
8
+ REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
9
+ ROOT_PATH = File.dirname(__FILE__)
10
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
11
+ require 'optparse'
12
+ require 'generalMethods.rb'
13
+
14
+ ##########################
15
+ #METHODS
16
+ ##########################
17
+
18
# Reads a two-column, tab-separated network file (term, gene) and splits it
# into two gene-indexed tables.
#
# network - path of the network file.
#
# Returns two Hashes, both gene => Array of terms: the first holds the terms
# containing 'GO:', the second holds every other term.
def load_network_data(network)
  go_gene_rels = {}
  domain_gene_rels = {}
  # File.foreach closes the file when iteration finishes.
  File.foreach(network) do |line|
    term, gene = line.chomp.split("\t")
    # Blank lines yield no fields; calling include? on nil would raise.
    next if term.nil?
    layer = term.include?('GO:') ? go_gene_rels : domain_gene_rels
    (layer[gene] ||= []) << term
  end
  return go_gene_rels, domain_gene_rels
end
42
+
43
# Removes from term_protein_rels every entry whose key also appears as a key
# of cafa_data.
#
# term_protein_rels - Hash to filter; it is modified in place.
# cafa_data         - collection yielding (gene, go_list) pairs; only the
#                     gene is used.
#
# Returns the same (now filtered) term_protein_rels object.
def check_genes(term_protein_rels, cafa_data)
  cafa_data.each { |gene, _go_list| term_protein_rels.delete(gene) }
  term_protein_rels
end
49
+ ##########################
50
+ #OPT-PARSER
51
+ ##########################
52
# Command-line options, pre-filled with their defaults; parsing overwrites
# the entries for which a flag was given.
options = {
  cafa_data: nil,
  input_network: nil,
  output_network: 'cafa_network.txt'
}

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--cafa_data PATH", "Input CAFA gene annotations") { |value| options[:cafa_data] = value }
  opts.on("-n", "--input_network PATH", "Input network to parse") { |value| options[:input_network] = value }
  opts.on("-o", "--output_network PATH", "Output network without CAFA proteins") { |value| options[:output_network] = value }

  # Print the usage text and stop.
  opts.on_tail("-h", "--help", "Show tool help") do
    puts opts
    exit
  end
end
option_parser.parse!
77
+
78
+ ##########################
79
+ #MAIN
80
+ ##########################
81
# Both input paths are mandatory; stop with a readable message when one is missing.
if options[:cafa_data].nil? || options[:input_network].nil?
  abort('ERROR: both -a/--cafa_data and -n/--input_network are required')
end

# load_cafa_data is not defined in this script; presumably it comes from the
# required generalMethods.rb -- confirm.
cafa_data = load_cafa_data(options[:cafa_data])
go_gene_rels, domain_gene_rels = load_network_data(options[:input_network])
# Drop every gene that is also a key of cafa_data from both layers.
genes2gos_layer = check_genes(go_gene_rels, cafa_data)
genes2domains_layer = check_genes(domain_gene_rels, cafa_data)

# Block form closes the output file even if writing raises.
# Rows are written as "term<TAB>gene": GO layer first, then the domain layer.
File.open(options[:output_network], 'w') do |handler|
  [genes2gos_layer, genes2domains_layer].each do |layer|
    layer.each do |gene, terms|
      terms.each do |term|
        handler.puts "#{term}\t#{gene}"
      end
    end
  end
end
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Project bootstrap script: runs `bundle install`.

# Stop on the first failing command, on any unset variable, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Split words on newline and tab only.
IFS=$'\n\t'
# Echo input lines as they are read and commands as they are executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,53 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ library(optparse)
4
+ library(ggplot2)
5
+ #####################
6
+ ## OPTPARSE
7
+ #####################
8
# Command-line interface.
option_list <- list(
  make_option(c("-d", "--data_file"), type="character",
    help="Tabulated file with information about each sample"),
  make_option(c("-o", "--output"), type="character", default="results",
    help="Output figure file"),
  make_option(c("-e", "--external_score"), type="double", default=NULL,
    help="Use external score"),
  make_option(c("-s", "--set_column"), type="character", default="",
    help="Name of column to be converted to Z-scores"),
  make_option(c("-p", "--plot_distribution"), action="store_true", default=FALSE,
    help="Print plot distribution")
)
opt <- parse_args(OptionParser(option_list=option_list))


################################################################
## MAIN
################################################################

# The table is read without a header row.
# NOTE(review): --set_column presumably has to be one of read.table's
# automatic column names -- confirm against callers.
data <- read.table(opt$data_file, sep="\t", header=FALSE)
raw_data <- data[[opt$set_column]]

# When an external score is supplied it is placed first, so that it is scaled
# together with the column values.
use_external <- !is.null(opt$external_score)
if (use_external) {
  raw_data <- c(opt$external_score, raw_data)
}

# Distribution mode: print the built boxplot and quit without writing output.
if (opt$plot_distribution) {
  plot_df <- as.data.frame(raw_data)
  colnames(plot_df) <- c("AssociationValue")
  box_plot <- ggplot(plot_df, aes(y=AssociationValue)) +
    geom_boxplot()
  print(ggplot_build(box_plot))
  quit(save = "default", status = 0, runLast = TRUE)
}

# Centre and scale the values.
z_scores <- scale(raw_data, center=TRUE, scale=TRUE)
if (use_external) {
  # Report the external score's z-score, then drop it so that only the
  # column values remain.
  cat(sep="", "ExtZScore\t", z_scores[1], "\n")
  z_scores <- z_scores[-1]
}

# Overwrite the selected column with its z-scores and write the table out.
data[[opt$set_column]] <- z_scores

write.table(data, file=opt$output, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
@@ -0,0 +1,98 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., July 2019
4
+ # Translate KEGG genes into pathways
5
+ ##########################
6
+
7
+ REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
8
+ ROOT_PATH = File.dirname(__FILE__)
9
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
10
+ # require 'generalMethods.rb'
11
+ require 'optparse'
12
+
13
+ ##########################
14
+ #METHODS
15
+ ##########################
16
+
17
# Builds a gene-to-pathways lookup from a two-column, tab-separated file whose
# first column is a KEGG gene ID and whose second column is a pathway ID.
#
# pathway_to_genes_file - path of the file to read.
#
# Returns a Hash of keggGeneID => Array of pathway IDs, in file order.
def load_kegg_dictionary(pathway_to_genes_file)
  kegg_dictionary = {}
  # File.foreach closes the file when iteration finishes.
  File.foreach(pathway_to_genes_file) do |line|
    keggGeneID, keggPathwayID = line.chomp.split("\t")
    # Blank or one-column lines would otherwise add nil keys or nil pathways.
    next if keggGeneID.nil? || keggPathwayID.nil?
    (kegg_dictionary[keggGeneID] ||= []) << keggPathwayID
  end
  kegg_dictionary
end
32
+
33
# Reads a two-column, tab-separated network file and separates its rows by
# whether the first column contains 'hsa:'.
#
# network_kegg_file - path of the file to read.
#
# Returns two Arrays of [first_column, second_column] pairs: rows whose first
# column contains 'hsa:', then all remaining rows.
def load_network(network_kegg_file)
  network_kegg = []
  superfamily_ids = []
  # File.foreach closes the file when iteration finishes.
  File.foreach(network_kegg_file) do |line|
    kegg_gene_ID, gene = line.chomp.split("\t")
    # Blank lines yield no fields; calling include? on nil would raise.
    next if kegg_gene_ID.nil?
    bucket = kegg_gene_ID.include?('hsa:') ? network_kegg : superfamily_ids
    bucket << [kegg_gene_ID, gene]
  end
  return network_kegg, superfamily_ids
end
47
+
48
+ ##########################
49
+ #OPT-PARSER
50
+ ##########################
51
# Command-line options, pre-filled with their defaults; parsing overwrites
# the entries for which a flag was given.
options = {
  kegg_pathways: nil,
  network_kegg: nil,
  output_path: 'network_kegg_pathways'
}

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-k", "--kegg_pathways PATH", "Input file with KEGG genes to pathways") { |value| options[:kegg_pathways] = value }
  opts.on("-n", "--network_kegg PATH", "Network with KEGG genes to translate into pathways") { |value| options[:network_kegg] = value }
  opts.on("-o", "--output_path PATH", "Resulting network output path") { |value| options[:output_path] = value }

  # Print the usage text and stop.
  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
option_parser.parse!
76
+
77
+ ##########################
78
+ #MAIN
79
+ ##########################
80
# Both input paths are mandatory; stop with a readable message instead of
# letting File raise a TypeError on nil.
if options[:kegg_pathways].nil? || options[:network_kegg].nil?
  abort('ERROR: both -k/--kegg_pathways and -n/--network_kegg are required')
end

kegg_dictionary = load_kegg_dictionary(options[:kegg_pathways])
network_kegg, superfamily_ids = load_network(options[:network_kegg])

# Replace each 'hsa:' gene by every pathway listed for it in the dictionary;
# genes absent from the dictionary contribute no rows.
pathways_network = network_kegg.flat_map do |kegg_gene_ID, proteinID|
  (kegg_dictionary[kegg_gene_ID] || []).map { |pathway| [pathway, proteinID] }
end

# Block form closes the output file even if writing raises.
# Translated pathway rows come first, then the untranslated remaining rows.
File.open(options[:output_path], 'w') do |handler|
  (pathways_network + superfamily_ids).each do |pair|
    handler.puts pair.join("\t")
  end
end
@@ -0,0 +1,174 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., June 2019
4
+ # Domain to functional annotation predictor validation system
5
+ # The script uses the predictions file and proteins-FunSys from UniProtKB
6
+ # It compares the predictions with the proteins-FunSys to validate the functioning of the predictor
7
+ # Generate values to plot in a PR
8
+ ##########################
9
+
10
+
11
+ REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
12
+ ROOT_PATH = File.dirname(__FILE__)
13
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
14
+ require 'generalMethods.rb'
15
+ require 'optparse'
16
+ require "statistics2"
17
+ require 'bigdecimal'
18
+
19
+
20
+ ##########################
21
+ #METHODS
22
+ ##########################
23
+
24
# Reads a tab-separated predictions file with the columns protein, domains,
# functional system and combined score.
#
# Lines containing 'ProteinID' (the header) and blank lines are skipped. The
# domains column is read but not kept; the score is converted with to_f.
#
# predictions_file - path of the file to read.
#
# Returns an Array of [protein, funSys, combinedScore] records.
def load_predictions_file(predictions_file)
  predictions = []
  # File.foreach closes the file when iteration finishes.
  File.foreach(predictions_file) do |line|
    line.chomp!
    next if line.include?('ProteinID')
    fields = line.split("\t")
    # Blank lines would otherwise become [nil, nil, 0.0] records.
    next if fields.empty?
    protein, _domains, funSys, combinedScore = fields
    predictions << [protein, funSys, combinedScore.to_f]
  end
  predictions
end
34
+
35
# Reads the control file: one protein per line, followed by a tab and a
# ';'-separated list of functional systems.
#
# Lines lacking either column (including blank lines) are skipped, since
# calling split on a missing second column would raise.
#
# control_file - path of the file to read.
#
# Returns a Hash of proteinID => Array of functional system labels. When a
# protein appears more than once, the last line wins.
def load_control_file(control_file)
  control_protein_FunSys = {}
  # File.foreach closes the file when iteration finishes.
  File.foreach(control_file) do |line|
    proteinID, funSys = line.chomp.split("\t")
    next if proteinID.nil? || funSys.nil?
    control_protein_FunSys[proteinID] = funSys.split(';')
  end
  control_protein_FunSys
end
44
+
45
# Groups (key, label, score) triples by key and tracks the score range.
#
# pairs_array - Array of [key, label, score] triples (protein, FunSys,
#               association score).
#
# Returns two values: a Hash of key => [labels, scores] (parallel Arrays in
# input order), and [min_score, max_score] over all triples, which is
# [nil, nil] when the input is empty.
def load_prediction(pairs_array)
  pred = {}
  lowest = nil
  highest = nil
  pairs_array.each do |key, label, score|
    if lowest.nil? || highest.nil?
      # First triple seen: it defines both ends of the range.
      lowest = score
      highest = score
    else
      lowest = score if score < lowest
      highest = score if score > highest
    end
    entry = (pred[key] ||= [[], []])
    entry.first << label
    entry.last << score
  end
  return pred, [lowest, highest]
end
66
+
67
+
68
+ # Pandey 2007, Association Analysis-based Transformations for Protein Interaction Networks: A Function Prediction Case Study
69
# Computes precision and recall at a series of evenly spaced score cut-offs.
#
# meth                   - accepted but not used inside this method.
# cut_number             - number of intervals between the lowest and highest
#                          prediction score (cut_number + 1 cut-offs result).
# top_number             - maximum number of labels kept per key at each cut-off.
# control_protein_FunSys - Hash of key => Array of true labels.
# predictions            - Array of [key, label, score] triples.
#
# Returns an Array of [cut, precision, recall] rows, one per cut-off.
def get_pred_rec(meth, cut_number = 100, top_number = 10000, control_protein_FunSys, predictions)
  preds, limits = load_prediction(predictions)
  get_cuts(limits, cut_number).map do |cut|
    prec, rec = pred_rec(preds, cut, top_number, control_protein_FunSys)
    [cut, prec, rec]
  end
end
79
+
80
# Precision and recall of the predictions at one score cut-off.
#
# preds                  - Hash of key => [labels, scores].
# cut                    - minimum score a label needs to be counted.
# top                    - maximum number of labels kept per key.
# control_protein_FunSys - Hash of key => Array of true labels.
#
# Over all control keys: m = reliable predicted labels, n = true labels,
# k = labels present in both. Precision is k/m and recall is k/n; a NaN
# result (0/0) is reported as 0.0.
#
# Returns [precision, recall].
def pred_rec(preds, cut, top, control_protein_FunSys)
  predicted_total = 0 # m
  true_total = 0      # n
  shared_total = 0    # k
  control_protein_FunSys.each do |key, c_labels|
    true_total += c_labels.length
    pred_info = preds[key]
    # Keys without predictions only add to the true-label count.
    next if pred_info.nil?
    labels, scores = pred_info
    reliable = get_reliable_labels(labels, scores, cut, top)
    predicted_total += reliable.length
    shared_total += (c_labels & reliable).length
  end
  ratios = [shared_total.to_f / predicted_total, shared_total.to_f / true_total]
  ratios.map { |value| value.nan? ? 0.0 : value }
end
101
+
102
# Builds n_cuts + 1 evenly spaced cut-off values starting at limits.first.
#
# limits - two-element Array [first, last] delimiting the score range.
# n_cuts - number of intervals the range is divided into.
#
# The step is converted to a BigDecimal with precision 10 before being
# multiplied and added; each cut-off is converted back to Float.
#
# Returns an Array of Floats.
def get_cuts(limits, n_cuts)
  start = limits.first
  step = BigDecimal((limits.last - start).abs.fdiv(n_cuts), 10)
  Array.new(n_cuts + 1) { |index| (start + index * step).to_f }
end
112
+
113
# Selects the labels whose score reaches the cut-off, best score first.
#
# labels - Array of labels.
# scores - Array of scores, parallel to labels.
# cut    - minimum score (inclusive) for a label to be kept.
# top    - maximum number of labels to return; 0 or less returns none.
#
# Returns an Array of labels ordered by descending score.
def get_reliable_labels(labels, scores, cut, top)
  passing = []
  scores.each_with_index do |score, i|
    passing << [labels[i], score] if score >= cut
  end
  # The previous [0..top-1] slice became [0..-1] for top == 0 and returned
  # every label; first(n) honours a zero limit.
  limit = [top, 0].max
  passing.sort { |l1, l2| l2.last <=> l1.last }.first(limit).map { |pred| pred.first }
end
121
+
122
+
123
+ ##########################
124
+ #OPT-PARSER
125
+ ##########################
126
+
127
# Command-line options, pre-filled with their defaults; parsing overwrites
# the entries for which a flag was given.
options = {
  input_predictions: nil,
  control_file: nil,
  assoc_meth: nil,
  performance_file: 'precision_recall.txt'
}

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--input_predictions PATH", "Domain-function predictions") { |value| options[:input_predictions] = value }
  opts.on("-c", "--control_file PATH", "Control dataset with proteins-FunSys from UniProtKB") { |value| options[:control_file] = value }
  opts.on("-m", "--assoc_meth STRING", "Association method used") { |value| options[:assoc_meth] = value }
  opts.on("-p", "--performance_file PATH", "Output file with PR values") { |value| options[:performance_file] = value }

  # Print the usage text and stop.
  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
option_parser.parse!
157
+
158
+ ##########################
159
+ #MAIN
160
+ ##########################
161
+
162
# Load the reference annotations and the predictions to validate.
control_protein_FunSys = load_control_file(options[:control_file])
domains_FunSys_predictions = load_predictions_file(options[:input_predictions])

# Precision/recall over 100 intervals of the score range, keeping at most
# 10000 labels per protein.
performance = get_pred_rec(options[:assoc_meth], 100, 10000, control_protein_FunSys, domains_FunSys_predictions)

# Write a header row, then one row per cut-off with the association method
# name appended as the last column.
File.open(options[:performance_file], 'w') do |f|
  f.puts %w[cut prec rec meth].join("\t")
  performance.each do |item|
    row = item << options[:assoc_meth].to_s
    f.puts row.join("\t")
  end
end