pets 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +6 -0
- data/bin/area_under_curve_pr.rb +118 -0
- data/bin/association_metrics_average.rb +94 -0
- data/bin/coPatReporter.rb +531 -0
- data/bin/console +14 -0
- data/bin/fmeasure_index.rb +72 -0
- data/bin/get_PR_values.rb +90 -0
- data/bin/get_clusters.R +18 -0
- data/bin/get_network_nodes.rb +197 -0
- data/bin/lines.R +77 -0
- data/bin/merge_by_cluster.rb +62 -0
- data/bin/merge_pairs.rb +138 -0
- data/bin/paco_translator.rb +102 -0
- data/bin/phen2reg.rb +385 -0
- data/bin/phen2reg_predictor_check.rb +297 -0
- data/bin/plot_area.R +71 -0
- data/bin/plot_boxplot.R +21 -0
- data/bin/plot_density.R +46 -0
- data/bin/plot_scatterplot.R +25 -0
- data/bin/reg2phen.rb +116 -0
- data/bin/region_to_patients_generator.rb +84 -0
- data/bin/relate_CI_to_association_value.rb +90 -0
- data/bin/setup +8 -0
- data/bin/standardize_scores.R +40 -0
- data/bin/xyplot_graph.R +60 -0
- data/external_data/biosystems_gene.gz +0 -0
- data/external_data/bsid2info.gz +0 -0
- data/external_data/chromosome_sizes_hg19.txt +24 -0
- data/external_data/gene_data.gz +0 -0
- data/external_data/gene_data_with_pathways.gz +0 -0
- data/external_data/gene_location.gz +0 -0
- data/external_data/hp.obo +146363 -0
- data/external_data/remove +0 -0
- data/lib/pets.rb +6 -0
- data/lib/pets/coPatReporterMethods.rb +77 -0
- data/lib/pets/generalMethods.rb +556 -0
- data/lib/pets/phen2reg_methods.rb +432 -0
- data/lib/pets/version.rb +3 -0
- data/pets.gemspec +47 -0
- data/templates/cohort_report.erb +93 -0
- data/templates/patient_report.erb +209 -0
- metadata +183 -0
@@ -0,0 +1,62 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#Tool to create the training file, taking as input the cluster_coords.txt file and phenotype_mutations_relations.txt
|
3
|
+
|
4
|
+
##########################
|
5
|
+
#RUBY GEMS
|
6
|
+
##########################
|
7
|
+
require 'optparse'
|
8
|
+
|
9
|
+
##########################
|
10
|
+
#METHODS
|
11
|
+
##########################
|
12
|
+
|
13
|
+
# Load cluster coordinates from a tab-separated file.
#
# Every line is read as four columns, in this order: start, stop, chromosome
# and node identifier. Lines without a fourth column (e.g. blank lines) are
# skipped instead of being stored under a nil key.
#
# @param cluster_file [String] path to the cluster coordinates file
# @return [Hash] node identifier => [chr, start, stop], all kept as strings
def load_cluster_file(cluster_file)
  clusters_info = {}
  # File.foreach closes the handle when iteration finishes or raises.
  File.foreach(cluster_file) do |line|
    start, stop, chr, node = line.chomp.split("\t")
    next if node.nil?
    clusters_info[node] = [chr, start, stop]
  end
  clusters_info
end
|
22
|
+
|
23
|
+
# Print the training table to standard output.
#
# Each line of the relations file is read as three tab-separated columns:
# phenotype (HPO), node identifier and association score. A relation is
# printed as "chr<TAB>start<TAB>stop<TAB>hpo<TAB>score<TAB>node" when the
# absolute score is strictly greater than +filter+.
#
# @param relations_file [String] path to the relations file
# @param clusters [Hash] node identifier => array of coordinate fields
# @param filter [Numeric] relations with |score| <= filter are discarded
def obtain_training(relations_file, clusters, filter)
  # File.foreach closes the handle when iteration finishes or raises.
  File.foreach(relations_file) do |line|
    hpo, node, score = line.chomp.split("\t")
    next if score.to_f.abs <= filter
    cluster_coords = clusters[node]
    if cluster_coords.nil?
      # A node without coordinates cannot be reported; warn instead of
      # crashing on nil.join.
      warn "WARNING: node '#{node}' not found in cluster file, relation skipped"
      next
    end
    puts "#{cluster_coords.join("\t")}\t#{hpo}\t#{score}\t#{node}"
  end
end
|
32
|
+
|
33
|
+
##########################
|
34
|
+
#OPT-PARSE
|
35
|
+
##########################
|
36
|
+
# Command-line options. Defaults are assigned before each switch so that
# every key exists even when the switch is not given.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:cluster_file] = nil
  opts.on("-c", "--cluster_file PATH", "Input file with patient clusters") do |cluster_path|
    options[:cluster_file] = cluster_path
  end

  options[:relations_file] = nil
  opts.on("-n", "--relations_file PATH", "Input relations file from tripartite network") do |relations_file|
    options[:relations_file] = relations_file
  end

  options[:filter_association] = 0
  # The threshold is compared with float scores: let OptionParser validate and
  # convert it, so a malformed value is rejected instead of silently becoming 0.0.
  opts.on("-f", "--filter_minimun FLOAT", Float, "Filter for association values") do |filter_association|
    options[:filter_association] = filter_association
  end
end.parse!
|
57
|
+
|
58
|
+
##########################
|
59
|
+
#MAIN
|
60
|
+
##########################
|
61
|
+
# Both input files are mandatory: stop with a readable message instead of the
# TypeError raised when a nil path is opened.
%i[cluster_file relations_file].each do |required|
  abort("ERROR: option --#{required} is required (see --help)") if options[required].nil?
end

clusters = load_cluster_file(options[:cluster_file])
obtain_training(options[:relations_file], clusters, options[:filter_association])
|
data/bin/merge_pairs.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
#################################
|
6
|
+
## METHODS
|
7
|
+
#################################
|
8
|
+
# Read a tab-separated pairs file and split its records into two groups.
#
# For every line the first column is the value and the last column is the
# grouping key of the record. Records whose first column matches +key+
# (interpreted as a regular expression) go to the first hash; all others go
# to the second one. Empty lines are ignored.
#
# @param file [String] path to the pairs file
# @param key [String] pattern identifying the first group
# @return [Array(Hash, Hash)] pairsA and pairsB, each last column => list of first columns
def load_pairs(file, key)
  pairsA = {}
  pairsB = {}
  key_pattern = /#{key}/ # built once instead of once per line
  # File.foreach closes the handle when iteration finishes or raises.
  File.foreach(file) do |line|
    fields = line.chomp.split("\t")
    next if fields.empty?
    target = fields.first =~ key_pattern ? pairsA : pairsB
    save_record(target, fields.last, fields.first)
  end
  return pairsA, pairsB
end
|
22
|
+
|
23
|
+
# Append +val+ to the list stored under +key+, creating the list on first use.
#
# @param hash [Hash] key => array of values; modified in place
# @param key [Object] key of the record
# @param val [Object] value to append
# @return [Array] the list stored under +key+ after the insertion
def save_record(hash, key, val)
  (hash[key] ||= []) << val
end
|
31
|
+
|
32
|
+
# Open +n_files+ output files for writing, named "<output>1.txt",
# "<output>2.txt", and so on. The caller is responsible for closing them.
#
# @param n_files [Integer] number of files to create (none when <= 0)
# @param output [String] path prefix of the generated files
# @return [Array<File>] open file handles, in numeric order
def generate_files(n_files, output)
  (1..n_files).map { |number| File.open("#{output}#{number}.txt", 'w') }
end
|
39
|
+
|
40
|
+
# Connect both groups through their shared keys and write every resulting
# triplet "valueA<TAB>key<TAB>valueB" to one of the output files, chosen at
# random for each triplet.
#
# @param pairsA [Hash] key => list of values of the first group
# @param pairsB [Hash] key => list of values of the second group
# @param n_files [Integer] number of output files to spread the triplets over
# @param files [Array<#puts>] open output handles
def connect_pairs_write(pairsA, pairsB, n_files, files)
  pairsA.each do |keyA, valA|
    valB = pairsB[keyA]
    next if valB.nil? # key only present in the first group: nothing to connect
    valA.product(valB) do |vA, vB|
      files[rand(n_files)].puts "#{vA}\t#{keyA}\t#{vB}"
    end
  end
end
|
52
|
+
|
53
|
+
# Collect, for every (valueA, valueB) combination connected through a shared
# key, the list of keys that support the connection.
#
# The relation identifier is "valueA_valueB" (both values joined by an
# underscore).
#
# @param pairsA [Hash] key => list of values of the first group
# @param pairsB [Hash] key => list of values of the second group
# @return [Hash] "valueA_valueB" => list of connecting keys
def get_relations(pairsA, pairsB)
  relations = {}
  pairsA.each do |keyA, valA|
    valB = pairsB[keyA]
    next if valB.nil? # key only present in the first group: nothing to connect
    valA.product(valB) do |vA, vB|
      (relations["#{vA}_#{vB}"] ||= []) << keyA
    end
  end
  relations
end
|
73
|
+
|
74
|
+
##############################
|
75
|
+
#OPTPARSE
|
76
|
+
##############################
|
77
|
+
|
78
|
+
# Command-line options. Defaults are assigned before each switch so that
# every key exists even when the switch is not given.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:input_file] = nil
  opts.on("-i", "--input_file PATH", "Input file for create adjacency matrix") do |input_file|
    options[:input_file] = input_file
  end

  options[:key] = ''
  opts.on("-k", "--key STRING", "String to split the two groups") do |key|
    options[:key] = key
  end

  options[:output] = 'tri_'
  opts.on("-o", "--output PATH", "Output network pairs") do |output|
    options[:output] = output
  end

  # Numeric switches are validated by OptionParser: a malformed value is
  # reported to the user instead of silently becoming 0.
  options[:n_files] = 10
  opts.on("-n", "--n_files INTEGER", OptionParser::DecimalInteger, "Split data onto n files") do |n|
    options[:n_files] = n
  end

  options[:min_connections] = 1
  opts.on("-m", "--min_connections INTEGER", OptionParser::DecimalInteger, "Minimum connections to take into account a relation") do |n|
    options[:min_connections] = n
  end

end.parse!
|
108
|
+
|
109
|
+
################################
|
110
|
+
## MAIN
|
111
|
+
################################
|
112
|
+
# Fail early with a readable message: a nil input path raises a TypeError and
# rand(0) returns a Float, which cannot select an output file.
abort('ERROR: option --input_file is required (see --help)') if options[:input_file].nil?
abort('ERROR: --n_files must be at least 1') if options[:n_files] < 1

files = generate_files(options[:n_files], options[:output])
begin
  pairsA, pairsB = load_pairs(options[:input_file], options[:key])
  if options[:min_connections] == 1
    connect_pairs_write(pairsA, pairsB, options[:n_files], files)
  else
    STDERR.puts "MIN. NUMBER OF CONNECTIONS = #{options[:min_connections]}"
    relations = get_relations(pairsA, pairsB)
    count = 0
    discarded = 0
    relations.each do |rel, connections|
      if connections.length >= options[:min_connections]
        # NOTE(review): the relation id is "<a>_<b>"; identifiers that contain
        # an underscore themselves are truncated by this split. Confirm that
        # node identifiers never contain '_'.
        fields = rel.split('_')
        connections.each do |con|
          files[rand(options[:n_files])].puts "#{fields.first}\t#{con}\t#{fields.last}"
        end
      else
        discarded += connections.length
      end
      count += connections.length
    end
    STDERR.puts "Relations: #{count}"
    STDERR.puts "Discarded: #{discarded}"
  end
ensure
  # Close (and flush) every output file even if processing fails midway.
  files.each(&:close)
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
# Locations resolved relative to this script.
ROOT_PATH = File.dirname(__FILE__)
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
# The library files (generalMethods.rb, ...) are shipped under lib/pets;
# the package contains no lib/gephepred directory.
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
7
|
+
|
8
|
+
require 'generalMethods.rb'
|
9
|
+
require 'optparse'
|
10
|
+
|
11
|
+
###############
|
12
|
+
#METHODS
|
13
|
+
###############
|
14
|
+
|
15
|
+
# Replace the HPO codes of every patient by their names.
#
# The first field of each patient record is a pipe-separated list of HPO
# codes. In the result that field is removed and the translated names,
# joined by pipes, are appended as the last field. The input records are not
# modified.
#
# A code that is missing from +hpo_storage+ is kept untranslated and reported
# on standard error, instead of aborting with a NoMethodError.
#
# @param patient_data [Hash] patient id => [hpo codes, other fields...]
# @param hpo_storage [Hash] HPO code => record whose element at index 1 is the term name
# @return [Hash] patient id => [other fields..., hpo names]
def translate_codes_to_terms(patient_data, hpo_storage)
  patients_with_hpo_names = {}
  patient_data.each do |patientID, hpos_and_cnvs|
    hpo_field, *other_fields = hpos_and_cnvs
    hpo_names = hpo_field.split('|').map do |hpo|
      hpo_record = hpo_storage[hpo]
      if hpo_record.nil?
        warn "WARNING: HPO code '#{hpo}' not found, kept untranslated"
        hpo
      else
        hpo_record[1]
      end
    end
    patients_with_hpo_names[patientID] = other_fields << hpo_names.join('|')
  end
  patients_with_hpo_names
end
|
29
|
+
|
30
|
+
# Write the translated cohort as a tab-separated file, one patient record per
# line: the patient id followed by its fields.
#
# Every "_i<digit>" occurrence is removed from the patient id before writing.
# NOTE(review): presumably this strips a per-record suffix added when the
# cohort is loaded; a suffix with two or more digits would only lose its
# first digit. Confirm against the loader.
#
# @param patients_with_hpo_names [Hash] patient id => array of fields
# @param output_file [String] path of the file to write
def save_translated_file(patients_with_hpo_names, output_file)
  # Block form: the file is closed even if writing raises.
  File.open(output_file, 'w') do |handler|
    patients_with_hpo_names.each do |id, data|
      patientID = id.gsub(/_i[0-9]/,'')
      handler.puts "#{patientID}\t#{data.join("\t")}"
    end
  end
end
|
38
|
+
|
39
|
+
###############
|
40
|
+
#OPTIONS
|
41
|
+
###############
|
42
|
+
|
43
|
+
# Command-line options. Defaults are assigned before each switch so that
# every key exists even when the switch is not given.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:chromosome_col] = nil
  opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
    options[:chromosome_col] = data
  end

  options[:pat_id_col] = nil
  opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
    options[:pat_id_col] = data
  end

  options[:end_col] = nil
  opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
    options[:end_col] = data
  end

  # The header is assumed by default and this flag turns it off; the help
  # text now says so (it previously described the opposite).
  options[:header] = true
  opts.on("-H", "--header", "Set if the file has NO header line. By default a header is expected") do
    options[:header] = false
  end

  options[:output_file] = 'paco_file_with_hpo_names.txt'
  opts.on("-o", "--output_file PATH", "Output paco file with HPO names") do |data|
    options[:output_file] = data
  end

  options[:input_file] = nil
  opts.on("-P", "--input_file PATH", "Input file with PACO extension") do |value|
    options[:input_file] = value
  end

  options[:hpo_col] = nil
  opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
    options[:hpo_col] = data
  end

  options[:start_col] = nil
  opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
    options[:start_col] = data
  end

end.parse!
|
89
|
+
|
90
|
+
|
91
|
+
###############
|
92
|
+
#MAIN
|
93
|
+
###############
|
94
|
+
|
95
|
+
# Load the ontology and the cohort, translate the HPO codes of every patient
# and write the result to the requested output file.
hpo_storage = load_hpo_file(HPO_FILE)
patient_data, $patient_number = load_patient_cohort(options)
save_translated_file(
  translate_codes_to_terms(patient_data, hpo_storage),
  options[:output_file]
)

Process.exit
|
data/bin/phen2reg.rb
ADDED
@@ -0,0 +1,385 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
# Rojano E. & Seoane P., September 2016
|
3
|
+
# Program to predict the position from given HPO codes, sorted by their association values.
|
4
|
+
|
5
|
+
# Locations resolved relative to this script.
REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
ROOT_PATH = File.dirname(__FILE__)
# The library files (generalMethods.rb, phen2reg_methods.rb) are shipped
# under lib/pets; the package contains no lib/gephepred directory.
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
8
|
+
require 'net/ftp'
|
9
|
+
require 'net/http'
|
10
|
+
require 'zlib'
|
11
|
+
require 'json'
|
12
|
+
require 'generalMethods.rb'
|
13
|
+
require 'phen2reg_methods.rb'
|
14
|
+
require 'optparse'
|
15
|
+
require 'report_html'
|
16
|
+
|
17
|
+
|
18
|
+
##########################
|
19
|
+
#METHODS
|
20
|
+
##########################
|
21
|
+
|
22
|
+
# Compute, for every predicted region, the percentage of the patient's
# original phenotypes recovered by the region, record it and remove the
# regions below the minimum percentage.
#
# An empty phenotype profile yields 0% (instead of NaN from 0 / 0.0).
#
# @param adjacent_regions_joined [Array<Array>] region records whose element
#   at index 3 is the list of HPO terms of the region; filtered in place
# @param patient_original_phenotypes [Array] HPO terms of the patient
# @param predicted_hpo_percentage [Hash] patient number => list of recovery
#   percentages; one value per region is appended in place
# @param min_hpo_recovery_percentage [Numeric] regions strictly below this
#   percentage are removed
# @param patient_number [Object] key used in +predicted_hpo_percentage+
# @return [Array<Integer>] original indexes of the removed regions
def calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number)
  records_to_delete = []
  total_phenotypes = patient_original_phenotypes.length.to_f
  adjacent_regions_joined.each_with_index do |region, index|
    hpo_list = region[3]
    hpo_coincidences = patient_original_phenotypes & hpo_list
    recovery_percentage = total_phenotypes.zero? ? 0.0 : hpo_coincidences.length / total_phenotypes * 100
    records_to_delete << index if recovery_percentage < min_hpo_recovery_percentage
    (predicted_hpo_percentage[patient_number] ||= []) << recovery_percentage
  end
  # Delete from the end so that pending indexes stay valid.
  records_to_delete.reverse_each do |record_number|
    adjacent_regions_joined.delete_at(record_number)
  end
end
|
41
|
+
|
42
|
+
# Download a file from an FTP server with an anonymous login.
#
# The data is first written to "<name>.part" and renamed once the transfer
# has finished, so an interrupted download never leaves a truncated file
# under the final name. The connection is closed even when the transfer fails.
#
# @param ftp_server [String] host name of the FTP server
# @param path [String] remote path of the file
# @param name [String] local path where the file is saved
def download(ftp_server, path, name)
  partial_name = "#{name}.part"
  begin
    # Block form of Net::FTP.open closes the connection on exit or error.
    Net::FTP.open(ftp_server) do |ftp|
      ftp.login
      ftp.getbinaryfile(path, partial_name)
    end
    File.rename(partial_name, name)
  ensure
    # Only present here when the transfer or the rename failed.
    File.delete(partial_name) if File.exist?(partial_name)
  end
end
|
49
|
+
|
50
|
+
##########################
|
51
|
+
#OPT-PARSER
|
52
|
+
##########################
|
53
|
+
|
54
|
+
# Command-line options. Defaults are assigned before each switch so that
# every key exists even when the switch is not given. Numeric switches are
# validated and converted by OptionParser, so a malformed value is reported
# to the user instead of silently becoming 0.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"
  options[:best_thresold] = 1.5
  opts.on("-b", "--best_thresold FLOAT", Float, "Association value threshold") do |best_thresold|
    options[:best_thresold] = best_thresold
  end

  options[:freedom_degree] = 'prednum'
  opts.on("-d", "--freedom_degree STRING", "Type of freedom degree calculation: prednum, phennum, maxnum") do |fd|
    options[:freedom_degree] = fd
  end

  options[:html_file] = "patient_profile_report.html"
  opts.on("-F", "--html_file PATH", "HTML file with patient information HPO profile summary") do |html_file|
    options[:html_file] = html_file
  end

  options[:hpo_file] = nil
  opts.on("-f", "--hpo_file PATH", "Input hp.obo file") do |hpo_file|
    options[:hpo_file] = hpo_file
  end

  options[:information_coefficient] = nil
  opts.on("-i", "--information_coefficient PATH", "Input file with information coefficients") do |information_coefficient|
    options[:information_coefficient] = information_coefficient
  end

  options[:retrieve_kegg_data] = false
  opts.on('-k', "--retrieve_kegg_data", "Add KEGG data to prediction report") do
    options[:retrieve_kegg_data] = true
  end

  options[:print_matrix] = false
  opts.on('-m', "--print_matrix", "Print output matrix") do
    options[:print_matrix] = true
  end

  options[:max_number] = 10
  opts.on("-M", "--max_number INTEGER", OptionParser::DecimalInteger, "Max number of regions to take into account") do |max_number|
    options[:max_number] = max_number
  end

  options[:hpo_is_name] = false
  opts.on("-n", "--hpo_is_name", "Set this flag if phenotypes are given as names instead of codes") do
    options[:hpo_is_name] = true
  end

  options[:output_quality_control] = "output_quality_control.txt"
  opts.on("-O", "--output_quality_control PATH", "Output file with quality control of all input HPOs") do |output_quality_control|
    options[:output_quality_control] = output_quality_control
  end

  options[:output_matrix] = 'output_matrix.txt'
  opts.on("-o", "--output_matrix PATH", "Output matrix file, with associations for each input HPO") do |output_matrix|
    options[:output_matrix] = output_matrix
  end

  options[:prediction_data] = nil
  #chr\tstart\tstop
  opts.on("-p", "--prediction_file PATH", "Input data with HPO codes for predicting their location. It can be either, a file path or string with HPO separated by pipes (|)") do |input_path|
    options[:prediction_data] = input_path
  end

  options[:pvalue_cutoff] = 0.1
  opts.on("-P", "--pvalue_cutoff FLOAT", Float, "P-value cutoff") do |pvalue_cutoff|
    options[:pvalue_cutoff] = pvalue_cutoff
  end

  options[:quality_control] = true
  opts.on("-Q", "--no_quality_control", "Disable quality control") do
    options[:quality_control] = false
  end

  options[:ranking_style] = ''
  opts.on("-r", "--ranking_style STRING", "Ranking style: mean, fisher, geommean") do |ranking_style|
    options[:ranking_style] = ranking_style
  end

  options[:write_hpo_recovery_file] = true
  opts.on("-s", "--write_hpo_recovery_file", "Disable write hpo recovery file") do
    options[:write_hpo_recovery_file] = false
  end

  options[:group_by_region] = true
  opts.on("-S", "--group_by_region", "Disable prediction which HPOs are located in the same region") do
    options[:group_by_region] = false
  end

  options[:html_reporting] = true
  opts.on("-T", "--no_html_reporting", "Disable html reporting") do
    options[:html_reporting] = false
  end

  options[:training_file] = nil
  #chr\tstart\tstop\tphenotype\tassociation_value
  opts.on("-t", "--training_file PATH", "Input training file, with association values") do |training_path|
    options[:training_file] = training_path
  end

  options[:multiple_profile] = false
  opts.on("-u", "--multiple_profile", "Set if multiple profiles") do
    options[:multiple_profile] = true
  end

  # The percentage is compared as a float, so fractional values are accepted.
  options[:hpo_recovery] = 50
  opts.on("-y", "--hpo_recovery FLOAT", Float, "Minimum percentage of HPO terms to consider predictions") do |hpo_recovery|
    options[:hpo_recovery] = hpo_recovery
  end

end.parse!
|
165
|
+
|
166
|
+
##########################
|
167
|
+
#PATHS
|
168
|
+
##########################
|
169
|
+
# Paths of the bundled external data files, resolved relative to this script.
all_paths = {code: File.join(File.dirname(__FILE__), '..')}
all_paths[:external_data] = File.join(all_paths[:code], 'external_data')
{
  gene_data: 'gene_data.gz',
  biosystems_gene: 'biosystems_gene.gz',
  biosystems_info: 'bsid2info.gz',
  gene_data_with_pathways: 'gene_data_with_pathways.gz',
  gene_location: 'gene_location.gz'
}.each do |path_key, file_name|
  all_paths[path_key] = File.join(all_paths[:external_data], file_name)
end
|
176
|
+
|
177
|
+
##########################
|
178
|
+
#DOWNLOADS
|
179
|
+
##########################
|
180
|
+
# External files fetched on first use: [FTP server, remote path, local path].
sources = [
  ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
]
sources.each do |server, path, output|
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  download(server, path, output) if !File.exist?(output)
end
|
188
|
+
|
189
|
+
##########################
|
190
|
+
#MAIN
|
191
|
+
##########################
|
192
|
+
|
193
|
+
# A prediction input is mandatory: File.exist?(nil) would raise a TypeError.
abort('ERROR: no prediction data given (use -p/--prediction_file)') if options[:prediction_data].nil?

# The prediction input is either a file path or a literal string of HPO terms.
# In both cases it is converted to a list of profiles (arrays of HPO terms).
prediction_input = options[:prediction_data]
options[:prediction_data] =
  if File.exist?(prediction_input)
    if options[:multiple_profile]
      # One profile per line, terms separated by pipes.
      File.readlines(prediction_input).map { |line| line.chomp.split('|') }
    else
      # A single profile, one term per line.
      [File.readlines(prediction_input).map { |line| line.chomp }]
    end
  elsif options[:multiple_profile]
    # Phenotypes given through the terminal: profiles separated by '!'.
    prediction_input.split('!').map { |profile| profile.split('|') }
  else
    [prediction_input.split('|')]
  end

##########################
#- Loading data

hpo_storage = load_hpo_file(options[:hpo_file])
if options[:quality_control]
  hpo_child_metadata = get_child_parent_relations(hpo_storage)
  hpos_ci_values = load_hpo_ci_values(options[:information_coefficient])
end

genes_with_kegg = {}
gene_location = {}
if options[:retrieve_kegg_data]
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  if !File.exist?(all_paths[:gene_data_with_pathways]) || !File.exist?(all_paths[:gene_location])
    # Build the gene/pathway caches from the downloaded raw files.
    gene_list, gene_location = load_gene_data(all_paths[:gene_data])
    kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
    genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
    write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
    write_compressed_plain_file(gene_location, all_paths[:gene_location])
  else
    gene_location = read_compressed_json(all_paths[:gene_location])
    genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
  end
end

trainingData = load_training_file4HPO(options[:training_file], options[:best_thresold])

##########################
#- HPO PROFILE ANALYSIS

phenotypes_by_patient = {}
predicted_hpo_percentage = {}
options[:prediction_data].each_with_index do |patient_hpo_profile, patient_number|
  # Both stay nil when no ranked prediction / no KEGG data is produced for
  # this profile; they are passed as such to the report.
  adjacent_regions_joined = nil
  genes_with_kegg_data = nil

  phenotypes_by_patient[patient_number] = patient_hpo_profile
  if options[:hpo_is_name]
    hpo_dictionary = create_hpo_dictionary(hpo_storage)
    # Translate names to codes in place; names without a code are dropped.
    patient_hpo_profile.map! { |name| hpo_dictionary[name] }
    patient_hpo_profile.compact!
  end

  #HPO quality control
  #---------------------------
  characterised_hpos = []
  if options[:quality_control]
    characterised_hpos = hpo_quality_control(patient_hpo_profile, hpo_storage, hpo_child_metadata, hpos_ci_values)
    header = ["HPO name", "HPO code", "Exists?", "CI value", "Is child of", "Childs"]
    # Block form: the file is closed even if writing raises.
    File.open(options[:output_quality_control], "w") do |output_quality_control|
      output_quality_control.puts Terminal::Table.new(:headings => header, :rows => characterised_hpos)
    end
  end

  #Prediction steps
  #---------------------------
  hpo_regions = search4HPO(patient_hpo_profile, trainingData)
  if hpo_regions.empty?
    puts "ProfID:#{patient_number}\tResults not found"
  elsif !options[:group_by_region]
    hpo_regions.each do |hpo, regions|
      regions.each do |region|
        puts "ProfID:#{patient_number}\t#{hpo}\t#{region.join("\t")}"
      end
    end
  else
    region2hpo, regionAttributes, association_scores = group_by_region(hpo_regions)
    null_value = 0
    hpo_region_matrix = generate_hpo_region_matrix(region2hpo, association_scores, patient_hpo_profile, null_value)
    if options[:print_matrix]
      File.open(options[:output_matrix] + "_#{patient_number}", "w") do |output_matrix|
        output_matrix.puts "Region\t#{patient_hpo_profile.join("\t")}"
        regionAttributes_array = regionAttributes.values
        hpo_region_matrix.each_with_index do |association_values, i|
          chr, start, stop = regionAttributes_array[i]
          output_matrix.puts "#{chr}:#{start}-#{stop}\t#{association_values.join("\t")}"
        end
      end
    end

    scoring_regions(regionAttributes, hpo_region_matrix, options[:ranking_style], options[:pvalue_cutoff], options[:freedom_degree], null_value)
    if regionAttributes.empty?
      puts "ProfID:#{patient_number}\tResults not found"
    else
      adjacent_regions_joined = []
      regionAttributes.each do |regionID, attributes|
        chr, start, stop, patient_ID, region_length, score = attributes
        association_values = association_scores[regionID]
        adjacent_regions_joined << [chr, start, stop, association_values.keys, association_values.values, score]
      end
      # TODO: move this step before the matrix is built
      adjacent_regions_joined = join_regions(adjacent_regions_joined)

      #Ranking
      # 'fisher' ranks in ascending score order, any other style in descending order.
      if options[:ranking_style] == 'fisher'
        adjacent_regions_joined.sort!{|r1, r2| r1.last <=> r2.last}
      else
        adjacent_regions_joined.sort!{|r1, r2| r2.last <=> r1.last}
      end
      patient_original_phenotypes = phenotypes_by_patient[patient_number]
      calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, options[:hpo_recovery], patient_number)
      if adjacent_regions_joined.empty?
        puts "ProfID:#{patient_number}\tResults not found"
      else
        adjacent_regions_joined = adjacent_regions_joined[0..options[:max_number]-1] if !options[:max_number].nil?
        adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score|
          puts "ProfID:#{patient_number}\t#{chr}\t#{start}\t#{stop}\t#{hpo_list.join(',')}\t#{association_values.join(',')}\t#{score}"
        end
      end
    end
  end

  pathway_stats = {}
  # Without ranked regions there is nothing to annotate (iterating nil used to
  # raise NoMethodError here).
  if options[:retrieve_kegg_data] && !adjacent_regions_joined.nil?
    genes_found = adjacent_regions_joined.map do |ref_chr, ref_start, ref_stop|
      # A chromosome without gene data contributes an empty gene list.
      chr_genes = gene_location[ref_chr] || []
      genes = []
      chr_genes.each do |gene_name, gene_start, gene_stop|
        # Standard interval overlap test. It covers every case of the former
        # four-clause condition and also the ones where a region boundary
        # coincides with a gene boundary, which were missed.
        genes << gene_name if ref_start < gene_stop && ref_stop > gene_start
      end
      genes
    end

    genes_with_kegg_data = genes_found.map do |genes|
      genes.map { |gene| [gene, genes_with_kegg[gene]] }
    end
    pathway_stats = compute_pathway_enrichment(genes_with_kegg_data, genes_with_kegg)
    pathway_stats.sort!{|p1, p2| p1.last <=> p2.last}
  end

  #Creating html report
  #-------------------
  ####PLEASE CHECK THIS METHOD!
  report_data(characterised_hpos, adjacent_regions_joined, options[:html_file], hpo_storage, genes_with_kegg_data, pathway_stats) if options[:html_reporting]
end # end each_with_index

if options[:write_hpo_recovery_file]
  File.open('output_profile_recovery', 'w') do |handler|
    predicted_hpo_percentage.each do |patient, percentage|
      percentage.each do |perc|
        handler.puts "ProfID:#{patient}\t#{perc.inspect}"
      end
    end
  end
end
|