DomFun 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,135 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+
5
+ ##########################
6
+ #MODULES
7
+ #########################
8
+
9
# Descriptive-statistics helpers added to every Enumerable of numbers.
module Enumerable

  # Only define sum where the running Ruby does not already provide
  # Enumerable#sum, so the built-in version (which also accepts an initial
  # value and a block) is never replaced by this simpler one.
  unless method_defined?(:sum)
    # Returns the total of all elements, starting from 0.
    def sum
      inject(0) { |accum, i| accum + i }
    end
  end

  # Arithmetic mean as a Float. An empty collection yields NaN (0 / 0.0).
  # Uses count instead of length so it works on any Enumerable (e.g. ranges),
  # not only on classes that happen to define length.
  def mean
    sum / count.to_f
  end

  # Unbiased sample variance (divides by n - 1).
  # With a single element the result is NaN (0.0 / 0.0).
  def sample_variance
    m = mean
    squared_deviations = inject(0) { |accum, i| accum + (i - m)**2 }
    squared_deviations / (count - 1).to_f
  end

  # Square root of the sample variance.
  def standard_deviation
    Math.sqrt(sample_variance)
  end

end
30
+
31
+ ##########################
32
+ #METHODS
33
+ #########################
34
+ # TODO: fix the input of combined scores
35
+
36
# Loads a tab-separated predictions file into a hash.
#
# Each line is expected to hold four columns: protein ID, domains, GO term
# and p-value. The domains column is read but not used.
#
# input_file - path of the predictions file.
#
# Returns { protein_id => [[go_term, p_value_as_float], ...] }, keeping the
# order in which the lines appear.
def load_predictions(input_file)
  predictions = {}
  # File.foreach closes the file when iteration ends or raises.
  File.foreach(input_file) do |line|
    line.chomp!
    next if line.empty? # a blank line would otherwise create a nil key
    protein_id, _domains, go_term, p_value = line.split("\t")
    (predictions[protein_id] ||= []) << [go_term, p_value.to_f]
  end
  return predictions
end
50
+
51
# Re-keys predictions from protein IDs to CAFA IDs.
#
# cafa_file   - path of a tab-separated file whose 4th column (index 3) is
#               the CAFA ID and whose 7th column (index 6) is the protein ID.
# predictions - hash { protein_id => [[go_id, value], ...] }.
#
# Returns an array of [cafa_id, go_id, value] triples, one per association of
# every protein found in both inputs, in file order.
def translate_protein_ids(cafa_file, predictions)
  cafa_predictions = []
  # File.foreach closes the file when iteration ends or raises.
  File.foreach(cafa_file) do |line|
    cafa_data = line.chomp.split("\t")
    cafa_id = cafa_data[3]
    protein_id = cafa_data[6]
    # Lines with fewer than seven columns carry no protein ID; never let them
    # match a nil key in predictions.
    next if protein_id.nil?
    go_associations = predictions[protein_id]
    next if go_associations.nil?
    go_associations.each do |go_id, value|
      cafa_predictions << [cafa_id, go_id, value]
    end
  end
  return cafa_predictions
end
67
+
68
# Rescales the association value of every prediction into the 0.1..0.9 range.
#
# Each value is turned into a z-score (using the mean and sample standard
# deviation of all values), clamped to [-2, 2], then mapped with z / 5 + 0.5.
#
# When the standard deviation is zero or NaN (a single prediction, or all
# values identical) the z-score is undefined; such values are treated as
# z = 0 and therefore map to 0.5 instead of NaN.
#
# cafa_predictions - array of [protein_id, go_term, value] triples.
#
# Returns a new array of [protein_id, go_term, normalized_value] triples in
# the same order; the input is not modified.
def normalize_association_values(cafa_predictions)
  raw_values = cafa_predictions.map { |_prot_id, _go_term, value| value.to_f }
  raw_values_mean = raw_values.mean
  raw_values_sd = raw_values.standard_deviation
  degenerate = raw_values_sd.nan? || raw_values_sd.zero?

  cafa_predictions.each_with_index.map do |(prot_id, go_term, _value), idx|
    z_score = degenerate ? 0.0 : (raw_values[idx] - raw_values_mean).fdiv(raw_values_sd)
    z_score = [[z_score, -2].max, 2].min # clamp to [-2, 2]
    [prot_id, go_term, z_score.fdiv(5) + 0.5]
  end
end
94
+
95
+ ##########################
96
+ #OPT-PARSER
97
+ ##########################
98
# Defaults for every command-line option; flags given by the user replace them.
options = {
  input_predictions: nil,
  input_cafa: nil,
  output_file: 'results_to_CAFA2_validation.txt',
  do_norm: false
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--input_predictions PATH", "Input predictions file") do |input_predictions|
    options[:input_predictions] = input_predictions
  end

  opts.on("-c", "--input_cafa PATH", "Input CAFA file to translate UniProtIDs to CAFAIDs") do |input_cafa|
    options[:input_cafa] = input_cafa
  end

  opts.on("-o", "--output_file PATH", "Output file") do |output_file|
    options[:output_file] = output_file
  end

  opts.on("-n", "--do_norm", "Normalize prediction values") do
    options[:do_norm] = true
  end

end.parse!

##########################
#MAIN
##########################

predictions = load_predictions(options[:input_predictions])
cafa_predictions = translate_protein_ids(options[:input_cafa], predictions)
cafa_predictions = normalize_association_values(cafa_predictions) if options[:do_norm]

# One tab-separated line per prediction. The block form of File.open closes
# (and flushes) the output file even if writing raises.
File.open(options[:output_file], 'w') do |handler|
  cafa_predictions.each do |info|
    handler.puts info.join("\t")
  end
end
@@ -0,0 +1,139 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., July 2019
4
+ # Generate training and testing datasets from CAFA2 and UniProt data.
5
+ # This script translates identifiers and generates the file for the
6
+ # construction of tripartite network (domain-protein-FunSys)
7
+ ##########################
8
+
9
+ REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
10
+ ROOT_PATH = File.dirname(__FILE__)
11
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
12
+ require 'generalMethods.rb'
13
+ require 'optparse'
14
+ require 'csv'
15
+
16
+ ##########################
17
+ #METHODS
18
+ ##########################
19
+
20
# Loads a two-column, tab-separated file into a hash of arrays.
#
# filename - path of the file to read.
# mode     - 'a' to use the first column as key and the second as value,
#            'b' to use the second column as key and the first as value.
#
# Returns { key => [value, ...] } with values in file order.
# Raises ArgumentError for any other mode (it would otherwise file every
# line under a nil key).
def load_hash(filename, mode)
  unless %w[a b].include?(mode)
    raise ArgumentError, "unknown mode #{mode.inspect}; expected 'a' or 'b'"
  end

  container = {}
  # File.foreach closes the file when iteration ends or raises.
  File.foreach(filename) do |line|
    line.chomp!
    next if line.empty? # a blank line would otherwise store nil => [nil]
    first, second = line.split("\t")
    key, value = mode == 'a' ? [first, second] : [second, first]
    (container[key] ||= []) << value
  end
  return container
end
35
+
36
# Writes the network file: one "<node>\t<protein>" line per pair.
#
# All annotation pairs are written first, followed by all domain pairs.
#
# domain_tuples - hash { protein_id => [domain, ...] }.
# annot_tuples  - hash { protein_id => [annotation, ...] }.
# filename      - path of the output file (overwritten).
def build_tripartite_network(domain_tuples, annot_tuples, filename)
  # The block form closes the file even if writing raises.
  File.open(filename, 'w') do |handler|
    [annot_tuples, domain_tuples].each do |tuples|
      tuples.each do |prot_id, nodes|
        nodes.each { |node| handler.puts "#{node}\t#{prot_id}" }
      end
    end
  end
end
50
+
51
# Loads protein-to-domain assignments from a tab-separated file.
#
# Each line is expected to hold: protein ID, superfamily ID, funfam ID.
#
# cath_file    - path of the file to read.
# domain_class - 'superfamilyID' to collect the second column; any other
#                value collects the third column.
#
# Returns { protein_id => [domain, ...] } with domains in file order
# (duplicates are kept).
def load_cath_info(cath_file, domain_class)
  use_superfamily = domain_class == 'superfamilyID'
  proteins_2_domains = {}
  # File.foreach closes the file when iteration ends or raises.
  File.foreach(cath_file) do |line|
    line.chomp!
    next if line.empty? # a blank line would otherwise create a nil key
    prot_id, superfam, funfam = line.split("\t")
    domain = use_superfamily ? superfam : funfam
    (proteins_2_domains[prot_id] ||= []) << domain
  end
  return proteins_2_domains
end
71
+
72
# Builds de-duplicated domain and annotation lists for annotated proteins.
#
# prot_annots  - { protein_id => [annotation, ...] }.
# prot_domains - { protein_id => [domain, ...] }.
#
# Returns two hashes, [domain_tuples, annot_tuples]:
#   domain_tuples - { protein_id => unique domains }, only for proteins that
#                   appear in prot_annots and have at least one domain.
#   annot_tuples  - { protein_id => unique annotations }, only for proteins
#                   with at least one annotation.
# First-seen order is preserved in both.
def generate_tuples(prot_annots, prot_domains)
  domain_tuples = {}
  annot_tuples = {}
  prot_annots.each do |prot_id, annots|
    domains = prot_domains[prot_id]
    # Array#| is an order-preserving union, replacing a per-element include?
    # scan; empty lists create no entry.
    unless domains.nil? || domains.empty?
      domain_tuples[prot_id] = (domain_tuples[prot_id] || []) | domains
    end
    unless annots.empty?
      annot_tuples[prot_id] = (annot_tuples[prot_id] || []) | annots
    end
  end
  return domain_tuples, annot_tuples
end
98
+
99
+ ##########################
100
+ #OPT-PARSER
101
+ ##########################
102
# Defaults for every command-line option; flags given by the user replace them.
options = {
  protein_domains: nil,
  annotated_proteins: nil,
  domain_class: 'funfamID',
  output_network: 'tripartite_network.txt'
}
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--protein_domains PATH", "Training proteins with CATH domains") do |path|
    options[:protein_domains] = path
  end

  opts.on("-b", "--annotated_proteins PATH", "Training proteins with annotations") do |path|
    options[:annotated_proteins] = path
  end

  opts.on("-d", "--domain_class STRING", "Domain identifiers type. Please choose funfamID or superfamilyID") do |name|
    options[:domain_class] = name
  end

  opts.on("-o", "--output_network PATH", "Output tripartite network from CAFA information") do |path|
    options[:output_network] = path
  end

  opts.on_tail("-h", "--help", "Tool information") do
    puts opts
    exit
  end
end
parser.parse!

##########################
#MAIN
##########################
# Annotations per protein (first column is the key), domains per protein,
# then the de-duplicated tuples restricted to annotated proteins.
annotations_by_protein = load_hash(options[:annotated_proteins], 'a')
domains_by_protein = load_cath_info(options[:protein_domains], options[:domain_class])
domain_tuples, annot_tuples = generate_tuples(annotations_by_protein, domains_by_protein)
build_tripartite_network(domain_tuples, annot_tuples, options[:output_network])
@@ -0,0 +1,45 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., July 2019
4
+ # Generate control from CAFA data (proteins and GO terms)
5
+ ##########################
6
+
7
+ REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
8
+ ROOT_PATH = File.dirname(__FILE__)
9
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
10
+ require 'optparse'
11
+ require 'generalMethods.rb'
12
+
13
+ ##########################
14
+ #OPT-PARSER
15
+ ##########################
16
# Defaults for every command-line option; flags given by the user replace them.
options = {
  input_cafa: nil,
  output_file: 'cafa_control.txt'
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--input_cafa PATH", "Input CAFA file") do |data|
    options[:input_cafa] = data
  end

  opts.on("-o", "--output_file PATH", "Output control from CAFA data") do |data|
    options[:output_file] = data
  end

  opts.on_tail("-h", "--help", "Tool information") do
    puts opts
    exit
  end

end.parse!

##########################
#MAIN
##########################
# load_cafa_data is not defined in this script; presumably it comes from
# generalMethods.rb and yields protein => list-of-GO-terms pairs (verify there).
cafa_data = load_cafa_data(options[:input_cafa])

# One line per protein: "<protein>\t<term1;term2;...>". The block form of
# File.open closes the output file even if writing raises.
File.open(options[:output_file], 'w') do |handler|
  cafa_data.each do |protein, go_terms|
    handler.puts "#{protein}\t#{go_terms.join(';')}"
  end
end
@@ -0,0 +1,12 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ # Get KEGG pathways from hsa identifiers.
4
+
5
+ ###################
6
+ # LIBRARIES
7
+ ###################
8
+
9
library(KEGGREST)

# Query KEGG for the links between "pathway" entries and the "hsa" database.
# NOTE(review): per KEGGREST, keggLink is expected to return a named character
# vector (names = hsa identifiers, values = pathway identifiers) - confirm.
pathway_links <- keggLink("pathway", "hsa")

# Two-column table: the names of the result, then its values.
link_table <- cbind(names(pathway_links), pathway_links)

# Tab-separated, no header, no row names, no quoting.
write.table(link_table, file = "kegg_pathways.txt", sep = "\t",
            quote = FALSE, row.names = FALSE, col.names = FALSE)
12
+
data/bin/lines.R ADDED
@@ -0,0 +1,74 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ library(ggplot2)
4
+ library(optparse)
5
+
6
+ #####################
7
+ ## OPTPARSE
8
+ #####################
9
# Command-line options of the plotting script.
# x_limit / y_limit are declared as "double": the axes default to the 0-1
# range, so fractional limits (e.g. 0.5) must be accepted. Integer values
# given on the command line are still valid.
option_list <- list(
  make_option(c("-d", "--data_file"), type="character",
    help="Tabulated file with information about each sample"),
  make_option(c("-o", "--output"), type="character", default="results",
    help="Output figure file"),
  make_option(c("-x", "--x_column"), type="character",
    help="Name of column to be used for X dimension"),
  make_option(c("-y", "--y_column"), type="character",
    help="Name of column to be used for Y dimension"),
  make_option(c("-s", "--set_column"), type="character", default="",
    help="Name of column to be used on set groups"),
  make_option(c("-L", "--no_legend"), action="store_true", default=FALSE,
    help="Remove legend"),
  make_option(c("-c", "--colours"), type="character", default="",
    help="Define which color is asigned to each data series. List colours comma separated."),
  make_option(c("-m", "--set_geom"), type="character", default="line",
    help="Choose the type of graphical representation, using points or lines"),
  make_option(c("-e", "--establish_limits"), action="store_true", default=FALSE,
    help="Allow establishing limits for X and Y axis. If true, please set x_limit and y_limit"),
  make_option(c("-X", "--x_limit"), type="double", default=0,
    help="Set x axis limit"),
  make_option(c("-Y", "--y_limit"), type="double", default=1,
    help="Set y axis limit")
)
opt <- parse_args(OptionParser(option_list=option_list))
35
+
36
+
37
+ ################################################################
38
+ ## MAIN
39
+ ################################################################
40
+
41
# Read the tab-separated input table (first row is the header).
data <- read.table(opt$data_file, sep="\t", header=TRUE)

# The figure is written to "<output>.pdf".
pdf(paste(opt$output, '.pdf', sep=""))

# Base plot; series are coloured by set_column when one is given.
if(opt$set_column != ""){
	obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]], color=data[[opt$set_column]]))
	# Title the legend with the column name instead of the raw aes expression.
	obj <- obj + labs(color=opt$set_column)
}else{
	obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]]))
}

# Optional manual colours, given as a comma-separated list.
if(opt$colours != ""){
	colours <- unlist(strsplit(opt$colours, ','))
	obj <- obj + scale_color_manual(values=c(colours))
}

if(opt$set_geom == 'point'){
	obj <- obj + geom_point()
}else if(opt$set_geom == 'line'){
	obj <- obj + geom_line()
}

# Both axes default to 0-1. With --establish_limits, x_limit replaces the
# lower X bound and y_limit replaces the upper Y bound. The limits are
# decided first so each axis scale is added exactly once (adding a second
# scale makes ggplot2 replace the first one and emit a message).
x_limits <- c(0, 1)
y_limits <- c(0, 1)
if (opt$establish_limits){
	x_limits <- c(opt$x_limit, 1)
	y_limits <- c(0, opt$y_limit)
}
obj <- obj + xlim(x_limits) + ylim(y_limits)

obj <- obj + xlab(opt$x_column)
obj <- obj + ylab(opt$y_column)
if(opt$no_legend){
	# "none" is the supported way to drop a guide; FALSE is deprecated.
	obj <- obj + guides(color="none")
}

# Draw explicitly rather than relying on top-level auto-printing.
print(obj)
dev.off()
74
+
@@ -0,0 +1,139 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+
5
+ #################################
6
+ ## METHODS
7
+ #################################
8
# Splits the pairs of a tab-separated file into two groups.
#
# For every line the first field is the node and the last field is the shared
# identifier. Lines whose first field matches `key` (used as a regular
# expression pattern, as before) go to the first group; all others go to the
# second group. The key is also echoed to STDERR.
#
# file - path of the file to read.
# key  - pattern that identifies first-group nodes.
#
# Returns [pairsA, pairsB], each { shared_id => [node, ...] }.
def load_pairs(file, key)
  STDERR.puts key
  key_pattern = /#{key}/ # built once instead of once per line
  pairsA = {}
  pairsB = {}
  # File.foreach closes the file when iteration ends or raises.
  File.foreach(file) do |line|
    fields = line.chomp.split("\t")
    next if fields.empty? # a blank line would otherwise store nil => [nil]
    target = fields.first =~ key_pattern ? pairsA : pairsB
    save_record(target, fields.last, fields.first)
  end
  return pairsA, pairsB
end
23
+
24
# Appends val to the array stored under key, creating the array on first use.
#
# hash - hash of arrays, modified in place.
# key  - entry to append to.
# val  - value to append.
#
# Returns the array stored under key.
def save_record(hash, key, val)
  bucket = hash[key]
  return hash[key] = [val] if bucket.nil?

  bucket << val
end
32
+
33
# Opens n_files output files for writing, named "<output>1.txt",
# "<output>2.txt", ... and returns the open handles in that order.
# Closing the handles is left to the caller.
def generate_files(n_files, output)
  n_files.times.map { |index| File.open("#{output}#{index + 1}.txt", 'w') }
end
40
+
41
# Joins both groups on their shared identifier and writes every combination.
#
# For each identifier present in both hashes, every (A node, B node)
# combination is written as "<A node>\t<identifier>\t<B node>" to one of the
# output files, picked at random for each line.
#
# pairsA, pairsB - { shared_id => [node, ...] }.
# n_files        - upper bound (exclusive) of the random file index.
# files          - array of writable handles.
def connect_pairs_write(pairsA, pairsB, n_files, files)
  pairsA.each do |shared_id, a_nodes|
    b_nodes = pairsB[shared_id]
    next if b_nodes.nil?

    a_nodes.product(b_nodes).each do |a_node, b_node|
      files[rand(n_files)].puts "#{a_node}\t#{shared_id}\t#{b_node}"
    end
  end
end
53
+
54
# Groups shared identifiers by the (A node, B node) pair they connect.
#
# pairsA, pairsB - { shared_id => [node, ...] }.
#
# Returns { "<A node>_<B node>" => [shared_id, ...] } covering every
# combination of nodes for each identifier present in both hashes.
def get_relations(pairsA, pairsB)
  pairsA.each_with_object({}) do |(shared_id, a_nodes), relations|
    b_nodes = pairsB[shared_id]
    next if b_nodes.nil?

    a_nodes.each do |a_node|
      b_nodes.each do |b_node|
        pair_id = a_node + '_' + b_node
        if relations.key?(pair_id)
          relations[pair_id] << shared_id
        else
          relations[pair_id] = [shared_id]
        end
      end
    end
  end
end
74
+
75
+ ##############################
76
+ #OPTPARSE
77
+ ##############################
78
+
79
# Defaults for every command-line option; flags given by the user replace them.
options = {
  input_file: nil,
  key: '',
  output: 'tri_',
  n_files: 10,
  min_connections: 1
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-i", "--input_file PATH", "Input file for create adjacency matrix") do |input_file|
    options[:input_file] = input_file
  end

  opts.on("-k", "--key STRING", "String to split th two groups") do |key|
    options[:key] = key
  end

  opts.on("-o", "--output PATH", "Output network pairs") do |output|
    options[:output] = output
  end

  opts.on("-n", "--n_files INTEGER", "Split data onto n files") do |n|
    options[:n_files] = n.to_i
  end

  opts.on("-m", "--min_connections INTEGER", "Minimun connections to take into account a relation") do |n|
    options[:min_connections] = n.to_i
  end

end.parse!

################################
## MAIN
################################
# With fewer than one output file there is nothing to write to; fail with a
# clear message instead of a NoMethodError on nil further down.
abort("--n_files must be a positive integer") if options[:n_files] < 1

files = generate_files(options[:n_files], options[:output])
begin
  pairsA, pairsB = load_pairs(options[:input_file], options[:key])
  if options[:min_connections] == 1
    connect_pairs_write(pairsA, pairsB, options[:n_files], files)
  else
    STDERR.puts "MIN. NUMBER OF CONNECTIONS = #{options[:min_connections]}"
    relations = get_relations(pairsA, pairsB)
    count = 0
    discarded = 0
    relations.each do |rel, connections|
      if connections.length >= options[:min_connections]
        # NOTE(review): relation keys are "<A>_<B>"; taking first/last of the
        # split assumes node identifiers contain no underscore - confirm.
        fields = rel.split('_')
        connections.each do |con|
          files[rand(options[:n_files])].puts "#{fields.first}\t#{con}\t#{fields.last}"
        end
      else
        discarded += connections.length
      end
      count += connections.length
    end
    STDERR.puts "Relations: #{count}"
    STDERR.puts "Discarded: #{discarded}"
  end
ensure
  # Close every output file even when loading or writing raised.
  files.each do |f|
    f.close
  end
end