DomFun 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+
5
+ ##########################
6
+ #MODULES
7
+ #########################
8
+
9
# Basic descriptive statistics made available on every Enumerable collection.
module Enumerable
  # Fallback only for interpreters without a native Enumerable#sum. When Ruby
  # already provides #sum it is left untouched, so the optional initial value
  # and the block form keep working for every other caller.
  unless method_defined?(:sum)
    def sum
      inject(0) { |accum, i| accum + i }
    end
  end

  # Arithmetic mean, always computed with Float division.
  # Uses #count (not #length) so it also works on ranges and other
  # Enumerables that are not arrays.
  def mean
    sum / count.to_f
  end

  # Sample variance: squared deviations from the mean divided by (n - 1).
  # A single-element collection yields NaN (0 / 0.0).
  def sample_variance
    m = mean
    squared_deviations = inject(0) { |accum, i| accum + (i - m)**2 }
    squared_deviations / (count - 1).to_f
  end

  # Square root of #sample_variance.
  def standard_deviation
    Math.sqrt(sample_variance)
  end
end
30
+
31
+ ##########################
32
+ #METHODS
33
+ #########################
34
+ # Fix the input of combined scores
35
+
36
# Load predictions from a tab-separated file.
#
# Each line is split into four fields: protein ID, domains, GO term and a
# numeric value (named p_value in this script). The domains column is ignored.
#
# @param input_file [String] path to the predictions file
# @return [Hash{String => Array<Array(String, Float)>}] protein ID mapped to
#   its [go_term, value] pairs, in file order. Unknown proteins look up as nil.
def load_predictions(input_file)
  predictions = {}
  # File.foreach closes the file when iteration ends; a bare File.open(...).each
  # leaves the handle open until garbage collection.
  File.foreach(input_file) do |line|
    protein_id, _domains, go_term, p_value = line.chomp.split("\t")
    (predictions[protein_id] ||= []) << [go_term, p_value.to_f]
  end
  predictions
end
50
+
51
# Re-key predictions by the identifiers found in a CAFA mapping file.
#
# Each line of cafa_file is split on tabs; the 4th column is taken as the
# CAFA ID and the 7th column as the protein ID used to look up predictions.
# Lines whose protein ID has no predictions are skipped.
#
# @param cafa_file [String] path to the tab-separated CAFA file
# @param predictions [Hash] protein ID => array of [go_id, value] pairs
# @return [Array<Array>] one [cafa_id, go_id, value] triple per prediction
def translate_protein_ids(cafa_file, predictions)
  cafa_predictions = []
  # File.foreach guarantees the input file is closed once it has been read.
  File.foreach(cafa_file) do |line|
    cafa_data = line.chomp.split("\t")
    cafa_id = cafa_data[3]
    protein_id = cafa_data[6]
    go_associations = predictions[protein_id]
    next if go_associations.nil?

    go_associations.each do |go_id, value|
      cafa_predictions << [cafa_id, go_id, value]
    end
  end
  cafa_predictions
end
67
+
68
# Rescale prediction values to the 0.1..0.9 interval through clipped z-scores.
#
# Every value is converted to a z-score against the mean and sample standard
# deviation of all values, clipped to [-2, 2], then mapped with z / 5 + 0.5.
# When the standard deviation is zero or NaN (all values equal, or a single
# prediction) no z-score exists; those rows get the neutral 0.5 instead of NaN.
#
# Relies on Enumerable#mean and Enumerable#standard_deviation, which this
# script adds to Enumerable.
#
# @param cafa_predictions [Array<Array>] [protein_id, go_term, value] triples
# @return [Array<Array>] new triples with the value replaced by its rescaled score
def normalize_association_values(cafa_predictions)
  raw_values = cafa_predictions.map { |_prot_id, _go_term, value| value.to_f }
  values_mean = raw_values.mean
  values_sd = raw_values.standard_deviation
  no_spread = values_sd.nan? || values_sd.zero?

  cafa_predictions.each_with_index.map do |(prot_id, go_term, _value), idx|
    z_score = no_spread ? 0.0 : (raw_values[idx] - values_mean).fdiv(values_sd)
    z_score = [[z_score, 2].min, -2].max # clip outliers to two standard deviations
    [prot_id, go_term, z_score.fdiv(5) + 0.5]
  end
end
94
+
95
+ ##########################
96
+ #OPT-PARSER
97
+ ##########################
98
# Command-line options, pre-filled with their defaults.
options = {
  input_predictions: nil,
  input_cafa: nil,
  output_file: 'results_to_CAFA2_validation.txt',
  do_norm: false
}

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--input_predictions PATH", "Input predictions file") { |path| options[:input_predictions] = path }
  opts.on("-c", "--input_cafa PATH", "Input CAFA file to translate UniProtIDs to CAFAIDs") { |path| options[:input_cafa] = path }
  opts.on("-o", "--output_file PATH", "Output file") { |path| options[:output_file] = path }
  opts.on("-n", "--do_norm", "Normalize prediction values") { options[:do_norm] = true }
end
# Consumes the recognised switches from ARGV.
option_parser.parse!
123
+
124
+ ##########################
125
+ #MAIN
126
+ ##########################
127
+
128
# Both input files are mandatory: stop with a readable message instead of
# letting File.open fail on a nil path.
if options[:input_predictions].nil? || options[:input_cafa].nil?
  abort 'ERROR: --input_predictions and --input_cafa are required'
end

predictions = load_predictions(options[:input_predictions])
cafa_predictions = translate_protein_ids(options[:input_cafa], predictions)
cafa_predictions = normalize_association_values(cafa_predictions) if options[:do_norm]

# One tab-separated line per [cafa_id, go_id, value] triple. The block form
# closes (and so flushes) the output file, which was previously never closed.
File.open(options[:output_file], 'w') do |handler|
  cafa_predictions.each { |info| handler.puts info.join("\t") }
end
@@ -0,0 +1,139 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., July 2019
4
+ # Generate training and testing datasets from CAFA2 and UniProt data.
5
+ # This script translates identifiers and generates the file for the
6
+ # construction of tripartite network (domain-protein-FunSys)
7
+ ##########################
8
+
9
+ REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
10
+ ROOT_PATH = File.dirname(__FILE__)
11
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
12
+ require 'generalMethods.rb'
13
+ require 'optparse'
14
+ require 'csv'
15
+
16
+ ##########################
17
+ #METHODS
18
+ ##########################
19
+
20
# Load a two-column, tab-separated file into a hash of arrays.
#
# @param filename [String] path to the file
# @param mode [String] 'a' uses column 1 as key and column 2 as value;
#   'b' uses column 2 as key and column 1 as value
# @return [Hash{String => Array<String>}] key mapped to every value seen for
#   it, in file order (duplicates kept)
# @raise [ArgumentError] if mode is neither 'a' nor 'b'; previously such a mode
#   silently produced a hash with a single nil key
def load_hash(filename, mode)
  raise ArgumentError, "Unknown mode '#{mode}': expected 'a' or 'b'" unless %w[a b].include?(mode)

  container = {}
  # File.foreach closes the file once it has been read.
  File.foreach(filename) do |line|
    first, second = line.chomp.split("\t")
    key, value = mode == 'a' ? [first, second] : [second, first]
    (container[key] ||= []) << value
  end
  container
end
35
+
36
# Write the network as tab-separated "node<TAB>protein" lines.
#
# All annotation pairs are written first, followed by all domain pairs.
#
# @param domain_tuples [Hash] protein ID => array of domain IDs
# @param annot_tuples [Hash] protein ID => array of annotation IDs
# @param filename [String] output path (overwritten)
# @return [nil]
def build_tripartite_network(domain_tuples, annot_tuples, filename)
  # Block form closes the file even if writing raises part-way through.
  File.open(filename, 'w') do |handler|
    [annot_tuples, domain_tuples].each do |tuples|
      tuples.each do |prot_id, nodes|
        nodes.each { |node| handler.puts "#{node}\t#{prot_id}" }
      end
    end
  end
  nil
end
50
+
51
# Load the domains of each protein from a tab-separated file whose columns are
# read as: protein ID, superfamily, funfam.
#
# @param cath_file [String] path to the file
# @param domain_class [String] 'superfamilyID' selects the 2nd column; any
#   other value selects the 3rd (funfam) column
# @return [Hash{String => Array<String>}] protein ID mapped to its domains in
#   file order (duplicates kept)
def load_cath_info(cath_file, domain_class)
  # The column choice does not depend on the line, so decide it once.
  domain_column = domain_class == 'superfamilyID' ? 1 : 2
  proteins_2_domains = {}
  # File.foreach closes the file once it has been read.
  File.foreach(cath_file) do |line|
    fields = line.chomp.split("\t")
    (proteins_2_domains[fields[0]] ||= []) << fields[domain_column]
  end
  proteins_2_domains
end
71
+
72
# Pair each annotated protein with its unique domains and unique annotations.
#
# Only proteins present in prot_annots are considered. A protein gets a domain
# entry only when prot_domains lists at least one domain for it, and an
# annotation entry only when it has at least one annotation.
#
# @param prot_annots [Hash] protein ID => array of annotations
# @param prot_domains [Hash] protein ID => array of domains
# @return [Array(Hash, Hash)] [domain_tuples, annot_tuples], each mapping
#   protein ID to a de-duplicated array in first-seen order
def generate_tuples(prot_annots, prot_domains)
  domain_tuples = {}
  annot_tuples = {}
  prot_annots.each do |prot_id, annots|
    domains = prot_domains[prot_id] || []
    # Array#| keeps first-seen order while dropping duplicates in one hashed
    # pass, replacing a linear include? scan per inserted element.
    domain_tuples[prot_id] = (domain_tuples[prot_id] || []) | domains unless domains.empty?
    annot_tuples[prot_id] = (annot_tuples[prot_id] || []) | annots unless annots.empty?
  end
  [domain_tuples, annot_tuples]
end
98
+
99
+ ##########################
100
+ #OPT-PARSER
101
+ ##########################
102
# Command-line options, pre-filled with their defaults.
options = {
  protein_domains: nil,
  annotated_proteins: nil,
  domain_class: 'funfamID',
  output_network: 'tripartite_network.txt'
}

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--protein_domains PATH", "Training proteins with CATH domains") { |value| options[:protein_domains] = value }
  opts.on("-b", "--annotated_proteins PATH", "Training proteins with annotations") { |value| options[:annotated_proteins] = value }
  opts.on("-d", "--domain_class STRING", "Domain identifiers type. Please choose funfamID or superfamilyID") { |value| options[:domain_class] = value }
  opts.on("-o", "--output_network PATH", "Output tripartite network from CAFA information") { |value| options[:output_network] = value }

  # Print the usage summary and stop.
  opts.on_tail("-h", "--help", "Tool information") do
    puts opts
    exit
  end
end
# Consumes the recognised switches from ARGV.
option_parser.parse!
132
+
133
+ ##########################
134
+ #MAIN
135
+ ##########################
136
# Both input files are mandatory: stop with a readable message instead of
# letting File.open fail on a nil path.
if options[:annotated_proteins].nil? || options[:protein_domains].nil?
  abort 'ERROR: --protein_domains and --annotated_proteins are required'
end

# Mode 'a': first column of the annotation file is the key, second the value.
proteins_with_annotations = load_hash(options[:annotated_proteins], 'a')
proteins_2_domains = load_cath_info(options[:protein_domains], options[:domain_class])
domain_tuples, annot_tuples = generate_tuples(proteins_with_annotations, proteins_2_domains)
build_tripartite_network(domain_tuples, annot_tuples, options[:output_network])
@@ -0,0 +1,45 @@
1
+ #! /usr/bin/env ruby
2
+ ##########################
3
+ # Rojano E. & Seoane P., July 2019
4
+ # Generate control from CAFA data (proteins and GO terms)
5
+ ##########################
6
+
7
+ REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
8
+ ROOT_PATH = File.dirname(__FILE__)
9
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
10
+ require 'optparse'
11
+ require 'generalMethods.rb'
12
+
13
+ ##########################
14
+ #OPT-PARSER
15
+ ##########################
16
# Command-line options, pre-filled with their defaults.
options = {
  input_cafa: nil,
  output_file: 'cafa_control.txt'
}

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--input_cafa PATH", "Input CAFA file") { |value| options[:input_cafa] = value }
  opts.on("-o", "--output_file PATH", "Output control from CAFA data") { |value| options[:output_file] = value }

  # Print the usage summary and stop.
  opts.on_tail("-h", "--help", "Tool information") do
    puts opts
    exit
  end
end
# Consumes the recognised switches from ARGV.
option_parser.parse!
36
+
37
+ ##########################
38
+ #MAIN
39
+ ##########################
40
# The CAFA input is mandatory: stop with a readable message when it is missing.
abort 'ERROR: --input_cafa is required' if options[:input_cafa].nil?

# load_cafa_data comes from generalMethods.rb (not visible here); its result is
# iterated as (protein, goTerms) pairs where goTerms responds to #join.
cafa_data = load_cafa_data(options[:input_cafa])

# One line per protein: "protein<TAB>term1;term2;...". The block form closes the
# file even if writing raises part-way through.
File.open(options[:output_file], 'w') do |handler|
  cafa_data.each do |protein, go_terms|
    handler.puts "#{protein}\t#{go_terms.join(';')}"
  end
end
@@ -0,0 +1,12 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ # Get KEGG pathways from hsa identifiers.
4
+
5
+ ###################
6
+ # LIBRARIES
7
+ ###################
8
+
9
+ library(KEGGREST)
10
# Ask KEGG for the pathway linked to each "hsa" entry; the result is used as a
# named vector (names = entries, values = pathways).
pathway_links <- keggLink("pathway", "hsa")

# Two-column table: entry name, linked pathway.
link_table <- cbind(names(pathway_links), pathway_links)

# Plain tab-separated dump: no header, no row names, no quoting.
write.table(link_table,
            file = "kegg_pathways.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = FALSE)
12
+
data/bin/lines.R ADDED
@@ -0,0 +1,74 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ library(ggplot2)
4
+ library(optparse)
5
+
6
+ #####################
7
+ ## OPTPARSE
8
+ #####################
9
# Command-line interface of the plotting script.
# The axis limits are declared as doubles: the axes span 0..1, so integer-only
# limits could never express a fractional bound such as 0.5.
option_list <- list(
  make_option(c("-d", "--data_file"), type="character",
    help="Tabulated file with information about each sample"),
  make_option(c("-o", "--output"), type="character", default="results",
    help="Output figure file"),
  make_option(c("-x", "--x_column"), type="character",
    help="Name of column to be used for X dimension"),
  make_option(c("-y", "--y_column"), type="character",
    help="Name of column to be used for Y dimension"),
  make_option(c("-s", "--set_column"), type="character", default="",
    help="Name of column to be used on set groups"),
  make_option(c("-L", "--no_legend"), action="store_true", default=FALSE,
    help="Remove legend"),
  make_option(c("-c", "--colours"), type="character", default="",
    help="Define which color is assigned to each data series. List colours comma separated."),
  make_option(c("-m", "--set_geom"), type="character", default="line",
    help="Choose the type of graphical representation, using points or lines"),
  make_option(c("-e", "--establish_limits"), action="store_true", default=FALSE,
    help="Allow establishing limits for X and Y axis. If true, please set x_limit and y_limit"),
  make_option(c("-X", "--x_limit"), type="double", default=0,
    help="Set x axis limit"),
  make_option(c("-Y", "--y_limit"), type="double", default=1,
    help="Set y axis limit")
)
opt <- parse_args(OptionParser(option_list=option_list))
35
+
36
+
37
+ ################################################################
38
+ ## MAIN
39
+ ################################################################
40
+
41
# Read the tab-separated table; its header row provides the column names that
# --x_column, --y_column and --set_column refer to.
data <- read.table(opt$data_file, sep="\t", header=TRUE)

# The figure is written to "<output>.pdf".
pdf(paste(opt$output, '.pdf', sep=""))
if(opt$set_column != ""){
  obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]], color=data[[opt$set_column]]))
  # Without an explicit label the legend title is the raw mapping expression.
  obj <- obj + labs(color=opt$set_column)
}else{
  obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]]))
}
if(opt$colours != ""){
  # One manual colour per data series, given as a comma-separated list.
  colours <- unlist(strsplit(opt$colours, ','))
  obj <- obj + scale_color_manual(values=c(colours))
}

# Any --set_geom value other than 'point' or 'line' adds no layer.
if(opt$set_geom == 'point'){
  obj <- obj + geom_point()
}else if(opt$set_geom == 'line'){
  obj <- obj + geom_line()
}

# Add each axis scale exactly once: the user limits when requested, otherwise
# the default 0..1 window. Adding the defaults first and the user limits
# afterwards made ggplot2 replace the existing scales with a message.
if(opt$establish_limits){
  obj <- obj + xlim(opt$x_limit, 1)
  obj <- obj + ylim(0, opt$y_limit)
}else{
  obj <- obj + xlim(0, 1)
  obj <- obj + ylim(0, 1)
}
obj <- obj + xlab(opt$x_column)
obj <- obj + ylab(opt$y_column)
if(opt$no_legend){
  # "none" is the supported way to drop a guide; FALSE is deprecated.
  obj <- obj + guides(color="none")
}
# Explicit print so the plot is drawn even when the script is source()d.
print(obj)
dev.off()
74
+
@@ -0,0 +1,139 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+
5
+ #################################
6
+ ## METHODS
7
+ #################################
8
# Split the pairs of a tab-separated file into two groups.
#
# For every line, the first field is the node and the last field is the shared
# identifier. Lines whose first field matches key (interpreted as a regular
# expression) go to the first group, all others to the second. Blank lines are
# skipped instead of being stored under a nil identifier.
#
# @param file [String] path to the pairs file
# @param key [String] pattern identifying nodes of the first group
# @return [Array(Hash, Hash)] [pairsA, pairsB], each mapping the shared
#   identifier to the array of nodes connected to it
def load_pairs(file, key)
  STDERR.puts key
  key_pattern = /#{key}/ # built once instead of on every line
  pairsA = {}
  pairsB = {}
  # File.foreach closes the file once it has been read.
  File.foreach(file) do |line|
    fields = line.chomp.split("\t")
    next if fields.empty?

    group = fields.first =~ key_pattern ? pairsA : pairsB
    save_record(group, fields.last, fields.first)
  end
  return pairsA, pairsB
end
23
+
24
# Append val to the array stored under key, creating the array on first use.
#
# @param hash [Hash] store that is modified in place
# @param key [Object] entry to append to
# @param val [Object] value to record
# @return [Array] the array now stored under key
def save_record(hash, key, val)
  bucket = hash[key]
  return hash[key] = [val] if bucket.nil?

  bucket << val
end
32
+
33
# Open n_files output files for writing, named "<output>1.txt" up to
# "<output><n_files>.txt".
#
# @param n_files [Integer] how many files to create
# @param output [String] path prefix shared by all files
# @return [Array<File>] the open handles, in numeric order; closing them is
#   left to the caller
def generate_files(n_files, output)
  n_files.times.map { |index| File.open("#{output}#{index + 1}.txt", 'w') }
end
40
+
41
# Join both groups through their shared identifier and scatter the resulting
# "nodeA<TAB>identifier<TAB>nodeB" lines over the output files at random.
#
# @param pairsA [Hash] identifier => nodes of the first group
# @param pairsB [Hash] identifier => nodes of the second group
# @param n_files [Integer] upper bound handed to rand when picking a file
# @param files [Array<#puts>] writable outputs
# @return [Hash] pairsA
def connect_pairs_write(pairsA, pairsB, n_files, files)
  pairsA.each do |shared_id, nodes_a|
    nodes_b = pairsB[shared_id]
    next if nodes_b.nil? # identifier absent from the second group

    nodes_a.each do |node_a|
      nodes_b.each do |node_b|
        target = files[rand(n_files)]
        target.puts "#{node_a}\t#{shared_id}\t#{node_b}"
      end
    end
  end
end
53
+
54
# Collect, for every pair of nodes joined through a shared identifier, the
# list of identifiers that join them.
#
# @param pairsA [Hash] identifier => nodes of the first group
# @param pairsB [Hash] identifier => nodes of the second group
# @return [Hash{String => Array}] "nodeA_nodeB" mapped to the shared
#   identifiers connecting both nodes
def get_relations(pairsA, pairsB)
  pairsA.each_with_object({}) do |(shared_id, nodes_a), relations|
    nodes_b = pairsB[shared_id]
    next if nodes_b.nil? # identifier absent from the second group

    nodes_a.each do |node_a|
      nodes_b.each do |node_b|
        (relations[node_a + '_' + node_b] ||= []) << shared_id
      end
    end
  end
end
74
+
75
+ ##############################
76
+ #OPTPARSE
77
+ ##############################
78
+
79
# Command-line options. Numeric switches are converted by OptionParser
# (decimal integers only), so a non-numeric argument is rejected up front
# rather than silently becoming 0 through String#to_i.
options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:input_file] = nil
  opts.on("-i", "--input_file PATH", "Input file for create adjacency matrix") do |input_file|
    options[:input_file] = input_file
  end

  options[:key] = ''
  opts.on("-k", "--key STRING", "String to split the two groups") do |key|
    options[:key] = key
  end

  options[:output] = 'tri_'
  opts.on("-o", "--output PATH", "Output network pairs") do |output|
    options[:output] = output
  end

  options[:n_files] = 10
  opts.on("-n", "--n_files INTEGER", OptionParser::DecimalInteger, "Split data onto n files") do |n|
    # With no output file there is nothing to write the pairs to.
    raise OptionParser::InvalidArgument, 'must be at least 1' if n < 1

    options[:n_files] = n
  end

  options[:min_connections] = 1
  opts.on("-m", "--min_connections INTEGER", OptionParser::DecimalInteger, "Minimum connections to take into account a relation") do |n|
    options[:min_connections] = n
  end
end
# Consumes the recognised switches from ARGV.
option_parser.parse!
109
+
110
+ ################################
111
+ ## MAIN
112
+ ################################
113
# The input file is mandatory: stop before creating any output file.
abort 'ERROR: --input_file is required' if options[:input_file].nil?

files = generate_files(options[:n_files], options[:output])
begin
  pairsA, pairsB = load_pairs(options[:input_file], options[:key])
  if options[:min_connections] == 1
    # No filtering needed: write every connected pair directly.
    connect_pairs_write(pairsA, pairsB, options[:n_files], files)
  else
    STDERR.puts "MIN. NUMBER OF CONNECTIONS = #{options[:min_connections]}"
    relations = get_relations(pairsA, pairsB)
    count = 0
    discarded = 0
    relations.each do |rel, connections|
      if connections.length >= options[:min_connections]
        # NOTE(review): relation keys are built as nodeA + '_' + nodeB in
        # get_relations, so node identifiers that themselves contain '_' are
        # not recovered intact by first/last here.
        fields = rel.split('_')
        connections.each do |con|
          files[rand(options[:n_files])].puts "#{fields.first}\t#{con}\t#{fields.last}"
        end
      else
        discarded += connections.length
      end
      count += connections.length
    end
    STDERR.puts "Relations: #{count}"
    STDERR.puts "Discarded: #{discarded}"
  end
ensure
  # Close (and flush) every output file even when loading or writing fails.
  files.each(&:close)
end