DomFun 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/DomFun.gemspec +44 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +6 -0
- data/bin/add_protein_functional_families.rb +133 -0
- data/bin/console +14 -0
- data/bin/domains_to_function_predictor.rb +287 -0
- data/bin/generate_CAFA2_dataset.rb +135 -0
- data/bin/generate_CAFA2_tripartite_network.rb +139 -0
- data/bin/generate_cafa_control.rb +45 -0
- data/bin/get_kegg_pathways.R +12 -0
- data/bin/lines.R +74 -0
- data/bin/merge_pairs.rb +139 -0
- data/bin/normalize_combined_scores.rb +118 -0
- data/bin/prepare_cafa_network.rb +96 -0
- data/bin/setup +8 -0
- data/bin/standardize_scores.R +53 -0
- data/bin/translate_kegg_genes2pathways.rb +98 -0
- data/bin/validate_ProtFunSys_predictions.rb +174 -0
- data/lib/DomFun.rb +6 -0
- data/lib/DomFun/generalMethods.rb +105 -0
- data/lib/DomFun/version.rb +3 -0
- metadata +128 -0
@@ -0,0 +1,135 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
##########################
|
6
|
+
#MODULES
|
7
|
+
#########################
|
8
|
+
|
9
|
+
module Enumerable
  # Numeric helpers used by the score-normalization step.
  # NOTE(review): this monkey-patch overrides Ruby's built-in Enumerable#sum
  # (present since 2.4) with an inject-based version; results match for the
  # plain numeric arrays used in this script.

  # Total of all elements, starting from 0.
  def sum
    inject(0) { |total, item| total + item }
  end

  # Arithmetic mean as a Float.
  def mean
    sum / length.to_f
  end

  # Unbiased (n - 1) sample variance; yields NaN for collections of size < 2.
  def sample_variance
    avg = mean
    squared_error = inject(0) { |total, item| total + (item - avg)**2 }
    squared_error / (length - 1).to_f
  end

  # Sample standard deviation (square root of the sample variance).
  def standard_deviation
    Math.sqrt(sample_variance)
  end
end
|
30
|
+
|
31
|
+
##########################
|
32
|
+
#METHODS
|
33
|
+
#########################
|
34
|
+
# Corregir el input de combined scores
|
35
|
+
|
36
|
+
# Parse a tab-separated predictions file into a hash of
# proteinID => [[go_term, p_value_as_float], ...].
# Columns: proteinID, domains (ignored here), GO term, p-value.
# Note: a missing p-value column yields 0.0 (nil.to_f).
def load_predictions(input_file)
  predictions = {}
  File.open(input_file).each do |line|
    proteinID, _domains, go_term, p_value = line.chomp.split("\t")
    (predictions[proteinID] ||= []) << [go_term, p_value.to_f]
  end
  return predictions
end
|
50
|
+
|
51
|
+
# Translate protein-keyed predictions to CAFA target identifiers.
# cafa_file is tab-separated; column 3 holds the CAFA ID and column 6 the
# UniProt protein ID (0-based). Proteins without predictions are skipped.
# Returns a flat list of [cafaID, goID, value] triples.
def translate_protein_ids(cafa_file, predictions)
  translated = []
  File.open(cafa_file).each do |line|
    columns = line.chomp.split("\t")
    cafaID = columns[3]
    annotations = predictions[columns[6]]
    next if annotations.nil?
    annotations.each { |goID, score| translated << [cafaID, goID, score] }
  end
  return translated
end
|
67
|
+
|
68
|
+
# Rescale association values: compute a z-score against the whole prediction
# set, clamp it to [-2, 2], then map linearly onto [0.1, 0.9] via z/5 + 0.5.
# Relies on the Enumerable extensions (mean / standard_deviation) defined
# earlier in this file. Returns new [protID, goTerm, scaled_value] triples.
def normalize_association_values(cafa_predictions)
  raw_values = cafa_predictions.map { |_protID, _goTerm, value| value.to_f }
  values_mean = raw_values.mean
  values_sd = raw_values.standard_deviation
  scaled_values = raw_values.map do |value|
    z_score = (value - values_mean).fdiv(values_sd)
    # Explicit clamping (not Float#clamp) so NaN from a zero sd passes through
    # unchanged, exactly as before.
    z_score = 2 if z_score > 2
    z_score = -2 if z_score < -2
    z_score.fdiv(5) + 0.5
  end
  normalized = []
  cafa_predictions.each_with_index do |(protID, goTerm, _value), index|
    normalized << [protID, goTerm, scaled_values[index]]
  end
  return normalized
end
|
94
|
+
|
95
|
+
##########################
#OPT-PARSER
##########################
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:input_predictions] = nil
  opts.on("-a", "--input_predictions PATH", "Input predictions file") do |input_predictions|
    options[:input_predictions] = input_predictions
  end

  options[:input_cafa] = nil
  opts.on("-c", "--input_cafa PATH", "Input CAFA file to translate UniProtIDs to CAFAIDs") do |input_cafa|
    options[:input_cafa] = input_cafa
  end

  options[:output_file] = 'results_to_CAFA2_validation.txt'
  opts.on("-o", "--output_file PATH", "Output file") do |output_file|
    options[:output_file] = output_file
  end

  options[:do_norm] = false
  opts.on("-n", "--do_norm", "Normalize prediction values") do
    options[:do_norm] = true
  end

end.parse!

##########################
#MAIN
##########################
# Load GO-term predictions per protein, translate UniProt IDs to CAFA target
# IDs and, optionally, rescale the association values before writing the
# tab-separated report.
predictions = load_predictions(options[:input_predictions])
cafa_predictions = translate_protein_ids(options[:input_cafa], predictions)
cafa_predictions = normalize_association_values(cafa_predictions) if options[:do_norm]

# Block form guarantees the output file is flushed and closed; the original
# opened a handler and never closed it (resource leak / possible lost buffer).
File.open(options[:output_file], 'w') do |handler|
  cafa_predictions.each do |info|
    handler.puts info.join("\t")
  end
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
##########################
|
3
|
+
# Rojano E. & Seoane P., July 2019
|
4
|
+
# Generate training and testing datasets from CAFA2 and UniProt data.
|
5
|
+
# This script translate identifiers and generate the file for the
|
6
|
+
# construction of tripartite network (domain-protein-FunSys)
|
7
|
+
##########################
|
8
|
+
|
9
|
+
REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
|
10
|
+
ROOT_PATH = File.dirname(__FILE__)
|
11
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
|
12
|
+
require 'generalMethods.rb'
|
13
|
+
require 'optparse'
|
14
|
+
require 'csv'
|
15
|
+
|
16
|
+
##########################
|
17
|
+
#METHODS
|
18
|
+
##########################
|
19
|
+
|
20
|
+
# Load a two-column tab-separated file into a hash of key => [values].
# mode 'a': first column is the key; mode 'b': second column is the key.
def load_hash(filename, mode)
  container = {}
  File.open(filename).each do |line|
    fields = line.chomp.split("\t")
    if mode == 'a'
      key, value = fields
    elsif mode == 'b'
      value, key = fields
    end
    (container[key] ||= []) << value
  end
  return container
end
|
35
|
+
|
36
|
+
# Write the tripartite network as "node<TAB>protein" edges: first all
# annotation->protein edges, then all domain->protein edges.
def build_tripartite_network(domain_tuples, annot_tuples, filename)
  out = File.open(filename, 'w')
  annot_tuples.each do |protID, annots|
    annots.each { |annot| out.puts "#{annot}\t#{protID}" }
  end
  domain_tuples.each do |protID, doms|
    doms.each { |dom| out.puts "#{dom}\t#{protID}" }
  end
  out.close
end
|
50
|
+
|
51
|
+
# Read CATH assignments (protID, superfamily, funfam; tab-separated) into a
# hash of protID => [domain IDs], keeping either the superfamily or the
# funfam column depending on domain_class ('superfamilyID' selects the
# superfamily; anything else selects the funfam).
def load_cath_info(cath_file, domain_class)
  proteins_2_domains = {}
  File.open(cath_file).each do |line|
    protID, superfam, funfam = line.chomp.split("\t")
    domain = domain_class == 'superfamilyID' ? superfam : funfam
    (proteins_2_domains[protID] ||= []) << domain
  end
  return proteins_2_domains
end
|
71
|
+
|
72
|
+
# Build de-duplicated tuples for the tripartite network, restricted to
# proteins that carry annotations. Returns [domain_tuples, annot_tuples]
# where each is protID => unique list (first-seen order). As before, a
# protein with no (or only missing) domains gets no domain_tuples entry,
# and an empty annotation list produces no annot_tuples entry.
def generate_tuples(prot_annots, prot_domains)
  domain_tuples = {}
  annot_tuples = {}
  prot_annots.each do |protID, annots|
    doms = prot_domains[protID]
    domain_tuples[protID] = doms.uniq unless doms.nil? || doms.empty?
    annot_tuples[protID] = annots.uniq unless annots.empty?
  end
  return domain_tuples, annot_tuples
end
|
98
|
+
|
99
|
+
##########################
#OPT-PARSER
##########################
# Defaults declared upfront; flag handlers simply overwrite them.
options = {
  protein_domains: nil,
  annotated_proteins: nil,
  domain_class: 'funfamID',
  output_network: 'tripartite_network.txt'
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--protein_domains PATH", "Training proteins with CATH domains") do |data|
    options[:protein_domains] = data
  end

  opts.on("-b", "--annotated_proteins PATH", "Training proteins with annotations") do |data|
    options[:annotated_proteins] = data
  end

  opts.on("-d", "--domain_class STRING", "Domain identifiers type. Please choose funfamID or superfamilyID") do |data|
    options[:domain_class] = data
  end

  opts.on("-o", "--output_network PATH", "Output tripartite network from CAFA information") do |data|
    options[:output_network] = data
  end

  opts.on_tail("-h", "--help", "Tool information") do
    puts opts
    exit
  end
end.parse!

##########################
#MAIN
##########################
# Pipeline: load protein->annotation pairs (key in the first column), load
# protein->CATH-domain assignments, de-duplicate both into tuples and write
# the domain-protein-FunSys network edges.
proteins_with_annotations = load_hash(options[:annotated_proteins], 'a')
proteins_2_domains = load_cath_info(options[:protein_domains], options[:domain_class])
domain_tuples, annot_tuples = generate_tuples(proteins_with_annotations, proteins_2_domains)
build_tripartite_network(domain_tuples, annot_tuples, options[:output_network])
|
@@ -0,0 +1,45 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
##########################
|
3
|
+
# Rojano E. & Seoane P., July 2019
|
4
|
+
# Generate control from CAFA data (proteins and GO terms)
|
5
|
+
##########################
|
6
|
+
|
7
|
+
REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
|
8
|
+
ROOT_PATH = File.dirname(__FILE__)
|
9
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
|
10
|
+
require 'optparse'
|
11
|
+
require 'generalMethods.rb'
|
12
|
+
|
13
|
+
##########################
#OPT-PARSER
##########################
# Defaults declared upfront; flag handlers simply overwrite them.
options = { input_cafa: nil, output_file: 'cafa_control.txt' }
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--input_cafa PATH", "Input CAFA file") do |data|
    options[:input_cafa] = data
  end

  opts.on("-o", "--output_file PATH", "Output control from CAFA data") do |data|
    options[:output_file] = data
  end

  opts.on_tail("-h", "--help", "Tool information") do
    puts opts
    exit
  end
end.parse!

##########################
#MAIN
##########################
# Write one "protein<TAB>GO;GO;..." line per CAFA protein as the control set.
# load_cafa_data comes from lib/DomFun/generalMethods.rb.
cafa_data = load_cafa_data(options[:input_cafa])
File.open(options[:output_file], 'w') do |handler|
  cafa_data.each do |protein, goTerms|
    handler.puts "#{protein}\t#{goTerms.join(';')}"
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
#! /usr/bin/env Rscript

# Dump every H. sapiens (hsa) gene -> KEGG pathway link as a two-column
# tab-separated file (gene id, pathway id).

###################
# LIBRARIES
###################

library(KEGGREST)

gene2pathway <- keggLink("pathway", "hsa")
write.table(cbind(names(gene2pathway), gene2pathway),
            file = "kegg_pathways.txt", sep = "\t",
            quote = FALSE, row.names = FALSE, col.names = FALSE)
|
12
|
+
|
data/bin/lines.R
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
#! /usr/bin/env Rscript

# Generic line/point plotter: reads a tabulated file and draws Y vs X with
# ggplot2, optionally coloured by a grouping column, writing <output>.pdf.

library(ggplot2)
library(optparse)

#####################
## OPTPARSE
#####################
option_list <- list(
  make_option(c("-d", "--data_file"), type="character",
    help="Tabulated file with information about each sample"),
  make_option(c("-o", "--output"), type="character", default="results",
    help="Output figure file"),
  make_option(c("-x", "--x_column"), type="character",
    help="Name of column to be used for X dimension"),
  make_option(c("-y", "--y_column"), type="character",
    help="Name of column to be used for Y dimension"),
  make_option(c("-s", "--set_column"), type="character", default="",
    help="Name of column to be used on set groups"),
  make_option(c("-L", "--no_legend"), action="store_true", default=FALSE,
    help="Remove legend"),
  make_option(c("-c", "--colours"), type="character", default="",
    help="Define which color is asigned to each data series. List colours comma separated."),
  make_option(c("-m", "--set_geom"), type="character", default="line",
    help="Choose the type of graphical representation, using points or lines"),
  make_option(c("-e", "--establish_limits"), action="store_true", default=FALSE,
    help="Allow establishing limits for X and Y axis. If true, please set x_limit and y_limit"),
  make_option(c("-X", "--x_limit"), type="integer", default=0,
    help="Set x axis limit"),
  make_option(c("-Y", "--y_limit"), type="integer", default=1,
    help="Set y axis limit")
)
opt <- parse_args(OptionParser(option_list=option_list))


################################################################
## MAIN
################################################################

data <- read.table(opt$data_file, sep="\t", header=TRUE)

pdf(paste(opt$output, '.pdf', sep=""))
# Base plot, optionally coloured by the grouping column.
if(opt$set_column != ""){
  obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]], color=data[[opt$set_column]]))
}else{
  obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]]))
}
if(opt$colours != ""){
  colours <- unlist(strsplit(opt$colours, ','))
  obj <- obj + scale_color_manual(values=c(colours))
}

if(opt$set_geom == 'point'){
  obj <- obj + geom_point()
}else if(opt$set_geom == 'line'){
  obj <- obj + geom_line()
}

# Add each axis scale exactly once. The original always added xlim(0,1)/
# ylim(0,1) and then added them again under --establish_limits, which made
# ggplot2 replace the scales and emit "Scale ... is already present" warnings;
# the rendered plot is unchanged by this restructure.
if (opt$establish_limits){
  obj <- obj + xlim(opt$x_limit, 1)
  obj <- obj + ylim(0, opt$y_limit)
}else{
  obj <- obj + xlim(0, 1)
  obj <- obj + ylim(0, 1)
}
obj <- obj + xlab(opt$x_column)
obj <- obj + ylab(opt$y_column)
if(opt$no_legend){
  # NOTE(review): guides(color=FALSE) is deprecated in newer ggplot2 (use
  # "none"); kept as-is for compatibility with the version targeted here.
  obj <- obj + guides(color=FALSE)
}
obj
dev.off()
|
74
|
+
|
data/bin/merge_pairs.rb
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
#################################
|
6
|
+
## METHODS
|
7
|
+
#################################
|
8
|
+
# Split a two-column tab-separated edge file into two hashes keyed by the
# second column: pairsA collects rows whose first column matches `key`,
# pairsB collects the rest. Echoes the key to STDERR for traceability.
# NOTE(review): `key` is interpolated unescaped into a regex; an empty key or
# one containing regex metacharacters may match more rows than intended —
# confirm against the expected identifier alphabet.
def load_pairs(file, key)
  STDERR.puts key
  pairsA = {}
  pairsB = {}
  File.open(file).each do |line|
    fields = line.chomp.split("\t")
    target = fields.first =~ /#{key}/ ? pairsA : pairsB
    save_record(target, fields.last, fields.first)
  end
  return pairsA, pairsB
end
|
23
|
+
|
24
|
+
# Append val to the array stored under key, creating the array on first use.
def save_record(hash, key, val)
  (hash[key] ||= []) << val
end
|
32
|
+
|
33
|
+
# Open n_files write handles named "<output>1.txt" .. "<output>N.txt" and
# return them as an array. Callers are responsible for closing them.
def generate_files(n_files, output)
  (1..n_files).map { |n| File.open("#{output}#{n}.txt", 'w') }
end
|
40
|
+
|
41
|
+
# For every key shared by pairsA and pairsB, write one "vA\tkey\tvB" line per
# cross-product combination, distributing lines across the open `files`
# handles uniformly at random.
def connect_pairs_write(pairsA, pairsB, n_files, files)
  pairsA.each do |shared_key, a_values|
    b_values = pairsB[shared_key]
    next if b_values.nil?
    a_values.each do |vA|
      b_values.each do |vB|
        files[rand(n_files)].puts "#{vA}\t#{shared_key}\t#{vB}"
      end
    end
  end
end
|
53
|
+
|
54
|
+
# Group the intermediate keys connecting each (vA, vB) pair: returns a hash
# of "vA_vB" => [shared keys]. Used to filter relations by connection count.
# NOTE(review): the composite key uses '_' as delimiter; identifiers that
# themselves contain underscores cannot be split back unambiguously.
def get_relations(pairsA, pairsB)
  relations = {}
  pairsA.each do |shared_key, a_values|
    b_values = pairsB[shared_key]
    next if b_values.nil?
    a_values.each do |vA|
      b_values.each do |vB|
        (relations["#{vA}_#{vB}"] ||= []) << shared_key
      end
    end
  end
  return relations
end
|
74
|
+
|
75
|
+
##############################
#OPTPARSE
##############################

# Defaults declared upfront; flag handlers simply overwrite them.
options = {
  input_file: nil,
  key: '',
  output: 'tri_',
  n_files: 10,
  min_connections: 1
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-i", "--input_file PATH", "Input file for create adjacency matrix") do |input_file|
    options[:input_file] = input_file
  end

  opts.on("-k", "--key STRING", "String to split th two groups") do |key|
    options[:key] = key
  end

  opts.on("-o", "--output PATH", "Output network pairs") do |output|
    options[:output] = output
  end

  opts.on("-n", "--n_files INTEGER", "Split data onto n files") do |n|
    options[:n_files] = n.to_i
  end

  opts.on("-m", "--min_connections INTEGER", "Minimun connections to take into account a relation") do |n|
    options[:min_connections] = n.to_i
  end

end.parse!

################################
## MAIN
################################
files = generate_files(options[:n_files], options[:output])

pairsA, pairsB = load_pairs(options[:input_file], options[:key])
if options[:min_connections] == 1
  # No filtering needed: stream every cross-product pair straight to disk.
  connect_pairs_write(pairsA, pairsB, options[:n_files], files)
else
  STDERR.puts "MIN. NUMBER OF CONNECTIONS = #{options[:min_connections]}"
  relations = get_relations(pairsA, pairsB)
  total = 0
  discarded = 0
  relations.each do |rel, connections|
    if connections.length >= options[:min_connections]
      # NOTE(review): rel is "vA_vB"; split('_') misparses identifiers that
      # themselves contain underscores — confirm the identifier alphabet.
      endpoints = rel.split('_')
      connections.each do |con|
        files[rand(options[:n_files])].puts "#{endpoints.first}\t#{con}\t#{endpoints.last}"
      end
    else
      discarded += connections.length
    end
    total += connections.length
  end
  STDERR.puts "Relations: #{total}"
  STDERR.puts "Discarded: #{discarded}"
end
files.each(&:close)
|