DomFun 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/DomFun.gemspec +44 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +6 -0
- data/bin/add_protein_functional_families.rb +133 -0
- data/bin/console +14 -0
- data/bin/domains_to_function_predictor.rb +287 -0
- data/bin/generate_CAFA2_dataset.rb +135 -0
- data/bin/generate_CAFA2_tripartite_network.rb +139 -0
- data/bin/generate_cafa_control.rb +45 -0
- data/bin/get_kegg_pathways.R +12 -0
- data/bin/lines.R +74 -0
- data/bin/merge_pairs.rb +139 -0
- data/bin/normalize_combined_scores.rb +118 -0
- data/bin/prepare_cafa_network.rb +96 -0
- data/bin/setup +8 -0
- data/bin/standardize_scores.R +53 -0
- data/bin/translate_kegg_genes2pathways.rb +98 -0
- data/bin/validate_ProtFunSys_predictions.rb +174 -0
- data/lib/DomFun.rb +6 -0
- data/lib/DomFun/generalMethods.rb +105 -0
- data/lib/DomFun/version.rb +3 -0
- metadata +128 -0
@@ -0,0 +1,135 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
##########################
|
6
|
+
#MODULES
|
7
|
+
#########################
|
8
|
+
|
9
|
+
# Statistical helpers mixed into Enumerable.
# NOTE(review): Ruby >= 2.4 already ships Enumerable#sum; this redefinition
# keeps the same numeric semantics for the arrays used by this script.
module Enumerable

  # Total of all elements (assumes numeric elements).
  def sum
    reduce(0) { |total, item| total + item }
  end

  # Arithmetic mean as a Float.
  def mean
    sum.fdiv(length)
  end

  # Unbiased sample variance (divides by n - 1).
  # NOTE(review): produces NaN/Infinity for collections with fewer than 2 elements.
  def sample_variance
    avg = mean
    squared_error = reduce(0) { |total, item| total + (item - avg)**2 }
    squared_error.fdiv(length - 1)
  end

  # Sample standard deviation.
  def standard_deviation
    Math.sqrt(sample_variance)
  end

end
|
30
|
+
|
31
|
+
##########################
|
32
|
+
#METHODS
|
33
|
+
#########################
|
34
|
+
# Corregir el input de combined scores
|
35
|
+
|
36
|
+
# Parses a tab-separated predictions file into a hash.
# Each line: proteinID<TAB>domains<TAB>go_term<TAB>p_value
# (the domains column is read but unused).
#
# @param input_file [String] path to the predictions file
# @return [Hash{String => Array<[String, Float]>}] proteinID => [[go_term, p_value], ...]
def load_predictions(input_file)
  predictions = {}
  # File.foreach closes the handle automatically; File.open(...).each leaked it.
  File.foreach(input_file) do |line|
    line.chomp!
    proteinID, _domains, go_term, p_value = line.split("\t")
    (predictions[proteinID] ||= []) << [go_term, p_value.to_f]
  end
  return predictions
end
|
50
|
+
|
51
|
+
# Translates UniProt protein IDs in predictions to CAFA IDs using a CAFA
# mapping file (CAFA ID in column 4, UniProt ID in column 7, 0-indexed 3/6).
# Proteins absent from the predictions hash are skipped.
#
# @param cafa_file [String] path to the CAFA mapping file
# @param predictions [Hash{String => Array<[String, Float]>}] from load_predictions
# @return [Array<[String, String, Float]>] [cafaID, goID, value] triplets
def translate_protein_ids(cafa_file, predictions)
  cafa_predictions = []
  # File.foreach closes the handle automatically; File.open(...).each leaked it.
  File.foreach(cafa_file) do |line|
    line.chomp!
    cafa_data = line.split("\t")
    cafaID = cafa_data[3]
    proteinID = cafa_data[6]
    go_associations = predictions[proteinID]
    next if go_associations.nil?
    go_associations.each do |goID, value|
      cafa_predictions << [cafaID, goID, value]
    end
  end
  return cafa_predictions
end
|
67
|
+
|
68
|
+
# Rescales prediction association values: each value becomes a z-score
# (clamped to [-2, 2]) mapped linearly into [0.1, 0.9] via z/5 + 0.5.
# Relies on the Enumerable#mean / #standard_deviation helpers defined above.
# NOTE(review): if every value is identical the standard deviation is 0 and
# the z-scores become NaN — original behavior, kept as-is.
#
# @param cafa_predictions [Array<[String, String, Numeric]>]
# @return [Array<[String, String, Float]>] same triplets with normalized values
def normalize_association_values(cafa_predictions)
  raw_values = cafa_predictions.map { |_protID, _goTerm, value| value.to_f }
  mean_value = raw_values.mean
  sd_value = raw_values.standard_deviation
  rescaled = raw_values.map do |value|
    z_score = (value - mean_value).fdiv(sd_value)
    z_score = 2 if z_score > 2
    z_score = -2 if z_score < -2
    z_score.fdiv(5) + 0.5
  end
  cafa_predictions.each_with_index.map do |(protID, goTerm, _value), index|
    [protID, goTerm, rescaled[index]]
  end
end
|
94
|
+
|
95
|
+
##########################
|
96
|
+
#OPT-PARSER
|
97
|
+
##########################
|
98
|
+
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:input_predictions] = nil
  opts.on("-a", "--input_predictions PATH", "Input predictions file") do |input_predictions|
    options[:input_predictions] = input_predictions
  end

  options[:input_cafa] = nil
  opts.on("-c", "--input_cafa PATH", "Input CAFA file to translate UniProtIDs to CAFAIDs") do |input_cafa|
    options[:input_cafa] = input_cafa
  end

  options[:output_file] = 'results_to_CAFA2_validation.txt'
  opts.on("-o", "--output_file PATH", "Output file") do |output_file|
    options[:output_file] = output_file
  end

  options[:do_norm] = false
  opts.on("-n", "--do_norm", "Normalize prediction values") do
    options[:do_norm] = true
  end

end.parse!

##########################
#MAIN
##########################

predictions = load_predictions(options[:input_predictions])
cafa_predictions = translate_protein_ids(options[:input_cafa], predictions)
cafa_predictions = normalize_association_values(cafa_predictions) if options[:do_norm]

# Block form guarantees the output handle is closed; the original opened an
# explicit handler and never closed it.
File.open(options[:output_file], 'w') do |handler|
  cafa_predictions.each do |info|
    handler.puts info.join("\t")
  end
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
##########################
|
3
|
+
# Rojano E. & Seoane P., July 2019
|
4
|
+
# Generate training and testing datasets from CAFA2 and UniProt data.
|
5
|
+
# This script translate identifiers and generate the file for the
|
6
|
+
# construction of tripartite network (domain-protein-FunSys)
|
7
|
+
##########################
|
8
|
+
|
9
|
+
REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
|
10
|
+
ROOT_PATH = File.dirname(__FILE__)
|
11
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
|
12
|
+
require 'generalMethods.rb'
|
13
|
+
require 'optparse'
|
14
|
+
require 'csv'
|
15
|
+
|
16
|
+
##########################
|
17
|
+
#METHODS
|
18
|
+
##########################
|
19
|
+
|
20
|
+
# Loads a two-column tab-separated file into a hash of key => [values].
# mode 'a': column 1 is the key; mode 'b': column 2 is the key.
# NOTE(review): any other mode leaves key/value nil for every line — kept as
# original behavior; callers only pass 'a' or 'b'.
#
# @param filename [String] path to the pairs file
# @param mode [String] 'a' or 'b' (column order)
# @return [Hash{String => Array<String>}]
def load_hash(filename, mode)
  container = {}
  # File.foreach closes the handle automatically; File.open(...).each leaked it.
  File.foreach(filename) do |line|
    line.chomp!
    key, value = line.split("\t") if mode == 'a'
    value, key = line.split("\t") if mode == 'b'
    (container[key] ||= []) << value
  end
  return container
end
|
35
|
+
|
36
|
+
# Writes the tripartite network to disk as tab-separated pairs:
# first every annotation->protein edge, then every domain->protein edge.
#
# @param domain_tuples [Hash{String => Array<String>}] proteinID => domains
# @param annot_tuples [Hash{String => Array<String>}] proteinID => annotations
# @param filename [String] output path
def build_tripartite_network(domain_tuples, annot_tuples, filename)
  # Block form closes the handle even if writing raises.
  File.open(filename, 'w') do |out|
    annot_tuples.each do |proteinID, annotations|
      annotations.each { |annotation| out.puts "#{annotation}\t#{proteinID}" }
    end
    domain_tuples.each do |proteinID, domains|
      domains.each { |domain| out.puts "#{domain}\t#{proteinID}" }
    end
  end
  nil
end
|
50
|
+
|
51
|
+
# Loads CATH info (proteinID, superfamily, funfam per tab-separated line),
# mapping each protein to its domains of the requested class.
#
# @param cath_file [String] path to the CATH file
# @param domain_class [String] 'superfamilyID' selects column 2; anything else
#   selects column 3 (funfam) — original behavior, not validated here
# @return [Hash{String => Array<String>}] proteinID => domain IDs (duplicates kept)
def load_cath_info(cath_file, domain_class)
  proteins_2_domains = {}
  use_superfamily = domain_class == 'superfamilyID' # loop-invariant, hoisted
  # File.foreach closes the handle automatically; File.open(...).each leaked it.
  File.foreach(cath_file) do |line|
    line.chomp!
    protID, superfam, funfam = line.split("\t")
    domain_type = use_superfamily ? superfam : funfam
    (proteins_2_domains[protID] ||= []) << domain_type
  end
  return proteins_2_domains
end
|
71
|
+
|
72
|
+
# Builds per-protein unique domain and annotation lists, restricted to
# proteins present in prot_annots (insertion order of values preserved).
#
# @param prot_annots [Hash{String => Array<String>}] proteinID => annotations
# @param prot_domains [Hash{String => Array<String>}] proteinID => domains
# @return [[Hash, Hash]] [domain_tuples, annot_tuples], proteinID => unique values
def generate_tuples(prot_annots, prot_domains)
  domain_tuples = {}
  annot_tuples = {}
  prot_annots.each do |proteinID, annotations|
    domains = prot_domains[proteinID]
    domains&.each do |domain|
      bucket = (domain_tuples[proteinID] ||= [])
      bucket << domain unless bucket.include?(domain)
    end
    annotations.each do |annotation|
      bucket = (annot_tuples[proteinID] ||= [])
      bucket << annotation unless bucket.include?(annotation)
    end
  end
  return domain_tuples, annot_tuples
end
|
98
|
+
|
99
|
+
##########################
|
100
|
+
#OPT-PARSER
|
101
|
+
##########################
|
102
|
+
# All CLI defaults declared up front instead of interleaved with the parser.
options = {
  protein_domains: nil,
  annotated_proteins: nil,
  domain_class: 'funfamID',
  output_network: 'tripartite_network.txt'
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--protein_domains PATH", "Training proteins with CATH domains") do |value|
    options[:protein_domains] = value
  end

  opts.on("-b", "--annotated_proteins PATH", "Training proteins with annotations") do |value|
    options[:annotated_proteins] = value
  end

  opts.on("-d", "--domain_class STRING", "Domain identifiers type. Please choose funfamID or superfamilyID") do |value|
    options[:domain_class] = value
  end

  opts.on("-o", "--output_network PATH", "Output tripartite network from CAFA information") do |value|
    options[:output_network] = value
  end

  opts.on_tail("-h", "--help", "Tool information") do
    puts opts
    exit
  end

end.parse!

##########################
#MAIN
##########################
# Load annotations keyed by protein, map proteins to their CATH domains,
# reduce both to unique tuples and write the tripartite network.
proteins_with_annotations = load_hash(options[:annotated_proteins], 'a')
proteins_2_domains = load_cath_info(options[:protein_domains], options[:domain_class])
domain_tuples, annot_tuples = generate_tuples(proteins_with_annotations, proteins_2_domains)
build_tripartite_network(domain_tuples, annot_tuples, options[:output_network])
|
@@ -0,0 +1,45 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
##########################
|
3
|
+
# Rojano E. & Seoane P., July 2019
|
4
|
+
# Generate control from CAFA data (proteins and GO terms)
|
5
|
+
##########################
|
6
|
+
|
7
|
+
REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
|
8
|
+
ROOT_PATH = File.dirname(__FILE__)
|
9
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
|
10
|
+
require 'optparse'
|
11
|
+
require 'generalMethods.rb'
|
12
|
+
|
13
|
+
##########################
|
14
|
+
#OPT-PARSER
|
15
|
+
##########################
|
16
|
+
# CLI defaults declared up front.
options = { input_cafa: nil, output_file: 'cafa_control.txt' }
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--input_cafa PATH", "Input CAFA file") do |value|
    options[:input_cafa] = value
  end

  opts.on("-o", "--output_file PATH", "Output control from CAFA data") do |value|
    options[:output_file] = value
  end

  opts.on_tail("-h", "--help", "Tool information") do
    puts opts
    exit
  end

end.parse!

##########################
#MAIN
##########################
# load_cafa_data comes from lib/DomFun/generalMethods.rb (required above).
cafa_data = load_cafa_data(options[:input_cafa])
# Block form closes the output handle automatically.
File.open(options[:output_file], 'w') do |handler|
  cafa_data.each do |protein, goTerms|
    handler.puts "#{protein}\t#{goTerms.join(';')}"
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
#! /usr/bin/env Rscript
|
2
|
+
|
3
|
+
# Get KEGG pathways from hsa identifiers.
|
4
|
+
|
5
|
+
###################
|
6
|
+
# LIBRARIES
|
7
|
+
###################
|
8
|
+
|
9
|
+
# Query KEGG for the pathway linked to every human (hsa) gene and dump the
# gene -> pathway pairs as a two-column tab-separated file.
library(KEGGREST)
gene2pathway <- keggLink("pathway", "hsa")
write.table(cbind(names(gene2pathway), gene2pathway), file = "kegg_pathways.txt",
            sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
|
12
|
+
|
data/bin/lines.R
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
#! /usr/bin/env Rscript
|
2
|
+
|
3
|
+
library(ggplot2)
|
4
|
+
library(optparse)
|
5
|
+
|
6
|
+
#####################
|
7
|
+
## OPTPARSE
|
8
|
+
#####################
|
9
|
+
option_list <- list(
  make_option(c("-d", "--data_file"), type="character",
    help="Tabulated file with information about each sample"),
  make_option(c("-o", "--output"), type="character", default="results",
    help="Output figure file"),
  make_option(c("-x", "--x_column"), type="character",
    help="Name of column to be used for X dimension"),
  make_option(c("-y", "--y_column"), type="character",
    help="Name of column to be used for Y dimension"),
  make_option(c("-s", "--set_column"), type="character", default="",
    help="Name of column to be used on set groups"),
  make_option(c("-L", "--no_legend"), action="store_true", default=FALSE,
    help="Remove legend"),
  make_option(c("-c", "--colours"), type="character", default="",
    help="Define which color is asigned to each data series. List colours comma separated."),
  make_option(c("-m", "--set_geom"), type="character", default="line",
    help="Choose the type of graphical representation, using points or lines"),
  make_option(c("-e", "--establish_limits"), action="store_true", default=FALSE,
    help="Allow establishing limits for X and Y axis. If true, please set x_limit and y_limit"),
  make_option(c("-X", "--x_limit"), type="integer", default=0,
    help="Set x axis limit"),
  make_option(c("-Y", "--y_limit"), type="integer", default=1,
    help="Set y axis limit")
)
opt <- parse_args(OptionParser(option_list=option_list))

################################################################
## MAIN
################################################################

data <- read.table(opt$data_file, sep="\t", header=TRUE)

pdf(paste(opt$output, '.pdf', sep=""))
# Colour by the set column only when one was given.
if(opt$set_column != ""){
  obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]], color=data[[opt$set_column]]))
}else{
  obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]]))
}
if(opt$colours != ""){
  colours <- unlist(strsplit(opt$colours, ','))
  obj <- obj + scale_color_manual(values=c(colours))
}

if(opt$set_geom == 'point'){
  obj <- obj + geom_point()
}else if(opt$set_geom == 'line'){
  obj <- obj + geom_line()
}

# Fix: the original always added xlim(0, 1)/ylim(0, 1) and, when
# --establish_limits was set, added a second pair of scales; ggplot2 silently
# drops the first pair with a "scale replaced" warning. Add each scale once.
if (opt$establish_limits){
  obj <- obj + xlim(opt$x_limit, 1)
  obj <- obj + ylim(0, opt$y_limit)
}else{
  obj <- obj + xlim(0, 1)
  obj <- obj + ylim(0, 1)
}
obj <- obj + xlab(opt$x_column)
obj <- obj + ylab(opt$y_column)
if(opt$no_legend){
  obj <- obj + guides(color=FALSE)
}
obj
dev.off()
|
74
|
+
|
data/bin/merge_pairs.rb
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
#################################
|
6
|
+
## METHODS
|
7
|
+
#################################
|
8
|
+
# Splits a two-column pairs file into two hashes keyed by the shared node
# (column 2): pairsA collects column-1 values matching /key/, pairsB the rest.
# NOTE(review): key is interpolated into a Regexp unescaped, so regex
# metacharacters keep their special meaning — original behavior, kept.
#
# @param file [String] path to tab-separated pairs file
# @param key [String] pattern identifying group-A identifiers
# @return [[Hash, Hash]] [pairsA, pairsB], each sharedID => [identifiers]
def load_pairs(file, key)
  STDERR.puts key # progress trace kept from the original
  pairsA = {}
  pairsB = {}
  key_pattern = /#{key}/ # compiled once instead of once per line
  # File.foreach closes the handle automatically; File.open(...).each leaked it.
  File.foreach(file) do |line|
    line.chomp!
    fields = line.split("\t")
    target = fields.first =~ key_pattern ? pairsA : pairsB
    (target[fields.last] ||= []) << fields.first
  end
  return pairsA, pairsB
end
|
23
|
+
|
24
|
+
# Appends val to the array stored under key, creating the array on first use.
def save_record(hash, key, val)
  (hash[key] ||= []) << val
end
|
32
|
+
|
33
|
+
# Opens n_files output files named "<output>1.txt" .. "<output>N.txt" for
# writing and returns their handles (callers are responsible for closing).
def generate_files(n_files, output)
  (1..n_files).map { |index| File.open("#{output}#{index}.txt", 'w') }
end
|
40
|
+
|
41
|
+
# Writes every "vA\tsharedKey\tvB" triplet connecting pairsA and pairsB
# through their shared keys, distributing lines randomly across the given
# open file handles.
def connect_pairs_write(pairsA, pairsB, n_files, files)
  pairsA.each do |shared_key, a_values|
    b_values = pairsB[shared_key]
    next if b_values.nil?
    a_values.product(b_values).each do |a_value, b_value|
      files[rand(n_files)].puts "#{a_value}\t#{shared_key}\t#{b_value}"
    end
  end
end
|
53
|
+
|
54
|
+
# Groups the shared keys connecting pairsA and pairsB by relation "vA_vB".
#
# @return [Hash{String => Array<String>}] "vA_vB" => shared keys supporting it
def get_relations(pairsA, pairsB)
  relations = {}
  pairsA.each do |shared_key, a_values|
    b_values = pairsB[shared_key]
    next if b_values.nil?
    a_values.product(b_values).each do |a_value, b_value|
      (relations["#{a_value}_#{b_value}"] ||= []) << shared_key
    end
  end
  return relations
end
|
74
|
+
|
75
|
+
##############################
|
76
|
+
#OPTPARSE
|
77
|
+
##############################
|
78
|
+
|
79
|
+
# All CLI defaults declared up front instead of interleaved with the parser.
options = {
  input_file: nil,
  key: '',
  output: 'tri_',
  n_files: 10,
  min_connections: 1
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-i", "--input_file PATH", "Input file for create adjacency matrix") do |value|
    options[:input_file] = value
  end

  opts.on("-k", "--key STRING", "String to split th two groups") do |value|
    options[:key] = value
  end

  opts.on("-o", "--output PATH", "Output network pairs") do |value|
    options[:output] = value
  end

  opts.on("-n", "--n_files INTEGER", "Split data onto n files") do |value|
    options[:n_files] = value.to_i
  end

  opts.on("-m", "--min_connections INTEGER", "Minimun connections to take into account a relation") do |value|
    options[:min_connections] = value.to_i
  end

end.parse!

################################
## MAIN
################################
files = generate_files(options[:n_files], options[:output])

pairsA, pairsB = load_pairs(options[:input_file], options[:key])
if options[:min_connections] == 1
  # Fast path: every connection is written, no per-relation counting needed.
  connect_pairs_write(pairsA, pairsB, options[:n_files], files)
else
  STDERR.puts "MIN. NUMBER OF CONNECTIONS = #{options[:min_connections]}"
  relations = get_relations(pairsA, pairsB)
  count = 0
  discarded = 0
  relations.each do |relation, connections|
    if connections.length >= options[:min_connections]
      endpoints = relation.split('_')
      connections.each do |shared_key|
        files[rand(options[:n_files])].puts "#{endpoints.first}\t#{shared_key}\t#{endpoints.last}"
      end
    else
      discarded += connections.length
    end
    count += connections.length
  end
  STDERR.puts "Relations: #{count}"
  STDERR.puts "Discarded: #{discarded}"
end
files.each(&:close)
|