pets 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +6 -0
- data/bin/area_under_curve_pr.rb +118 -0
- data/bin/association_metrics_average.rb +94 -0
- data/bin/coPatReporter.rb +531 -0
- data/bin/console +14 -0
- data/bin/fmeasure_index.rb +72 -0
- data/bin/get_PR_values.rb +90 -0
- data/bin/get_clusters.R +18 -0
- data/bin/get_network_nodes.rb +197 -0
- data/bin/lines.R +77 -0
- data/bin/merge_by_cluster.rb +62 -0
- data/bin/merge_pairs.rb +138 -0
- data/bin/paco_translator.rb +102 -0
- data/bin/phen2reg.rb +385 -0
- data/bin/phen2reg_predictor_check.rb +297 -0
- data/bin/plot_area.R +71 -0
- data/bin/plot_boxplot.R +21 -0
- data/bin/plot_density.R +46 -0
- data/bin/plot_scatterplot.R +25 -0
- data/bin/reg2phen.rb +116 -0
- data/bin/region_to_patients_generator.rb +84 -0
- data/bin/relate_CI_to_association_value.rb +90 -0
- data/bin/setup +8 -0
- data/bin/standardize_scores.R +40 -0
- data/bin/xyplot_graph.R +60 -0
- data/external_data/biosystems_gene.gz +0 -0
- data/external_data/bsid2info.gz +0 -0
- data/external_data/chromosome_sizes_hg19.txt +24 -0
- data/external_data/gene_data.gz +0 -0
- data/external_data/gene_data_with_pathways.gz +0 -0
- data/external_data/gene_location.gz +0 -0
- data/external_data/hp.obo +146363 -0
- data/external_data/remove +0 -0
- data/lib/pets.rb +6 -0
- data/lib/pets/coPatReporterMethods.rb +77 -0
- data/lib/pets/generalMethods.rb +556 -0
- data/lib/pets/phen2reg_methods.rb +432 -0
- data/lib/pets/version.rb +3 -0
- data/pets.gemspec +47 -0
- data/templates/cohort_report.erb +93 -0
- data/templates/patient_report.erb +209 -0
- metadata +183 -0
@@ -0,0 +1,62 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#Tool to create the training file, taking as input the cluster_coords.txt file and phenotype_mutations_relations.txt
|
3
|
+
|
4
|
+
##########################
|
5
|
+
#RUBY GEMS
|
6
|
+
##########################
|
7
|
+
require 'optparse'
|
8
|
+
|
9
|
+
##########################
|
10
|
+
#METHODS
|
11
|
+
##########################
|
12
|
+
|
13
|
+
# Loads the cluster coordinates file into a lookup table keyed by node id.
#
# Every line is split on tabs and its first four fields are read, in order,
# as start, stop, chr and node.
#
# @param cluster_file [String] path to the tab-separated cluster file
# @return [Hash] node => [chr, start, stop]; all fields are kept as strings
def load_cluster_file(cluster_file)
  clusters_info = {}
  # File.foreach closes the file when iteration ends; File.open(...).each
  # left the handle open until garbage collection.
  File.foreach(cluster_file) do |line|
    line.chomp!
    next if line.empty? # a blank line would otherwise register a nil node
    start, stop, chr, node = line.split("\t")
    clusters_info[node] = [chr, start, stop]
  end
  clusters_info
end
|
22
|
+
|
23
|
+
# Prints the training records to standard output.
#
# Each line of the relations file is read as "hpo<TAB>node<TAB>score". Records
# whose absolute score is not greater than +filter+ are dropped. For the rest,
# the coordinates stored for the node are printed followed by the HPO, the
# score and the node id, all tab-separated.
#
# @param relations_file [String] path to the tab-separated relations file
# @param clusters [Hash] node => array of coordinate fields
# @param filter [Numeric] records with |score| <= filter are skipped
def obtain_training(relations_file, clusters, filter)
  # File.foreach closes the file when done (File.open(...).each leaked it).
  File.foreach(relations_file) do |line|
    line.chomp!
    hpo, node, score = line.split("\t")
    next if score.to_f.abs <= filter
    cluster_coords = clusters[node]
    if cluster_coords.nil?
      # Previously this aborted the whole run with NoMethodError on nil.join.
      STDERR.puts "WARNING: node #{node} not found in cluster file, relation skipped"
      next
    end
    puts "#{cluster_coords.join("\t")}\t#{hpo}\t#{score}\t#{node}"
  end
end
|
32
|
+
|
33
|
+
##########################
|
34
|
+
#OPT-PARSE
|
35
|
+
##########################
|
36
|
+
# Command-line options: the two input paths and the association filter.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:cluster_file] = nil
  opts.on("-c", "--cluster_file PATH", "Input file with patient clusters") do |cluster_path|
    options[:cluster_file] = cluster_path
  end

  options[:relations_file] = nil
  opts.on("-n", "--relations_file PATH", "Input relations file from tripartite network") do |relations_file|
    options[:relations_file] = relations_file
  end

  options[:filter_association] = 0
  # Float coercion makes OptionParser reject non-numeric input instead of
  # silently turning it into 0.0; the flag name is kept as released.
  opts.on("-f", "--filter_minimun FLOAT", Float, "Filter for association values") do |filter_association|
    options[:filter_association] = filter_association
  end
end.parse!

##########################
#MAIN
##########################
# Both paths are mandatory: fail with a readable message rather than the
# TypeError raised when nil is used as a file path.
[:cluster_file, :relations_file].each do |required|
  abort("ERROR: --#{required} is required") if options[required].nil?
end

clusters = load_cluster_file(options[:cluster_file])
obtain_training(options[:relations_file], clusters, options[:filter_association])
|
data/bin/merge_pairs.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
#################################
|
6
|
+
## METHODS
|
7
|
+
#################################
|
8
|
+
# Reads a tab-separated pair file and splits its records into two groups.
#
# The first field of each line is matched against +key+, which is interpolated
# into a regular expression. Matching records go to the first hash, all others
# to the second. Both hashes are indexed by the LAST field of the line and
# collect the FIRST fields seen for it (via save_record).
#
# @param file [String] path to the pair file
# @param key [String] pattern that identifies the first group
# @return [Array(Hash, Hash)] pairsA (first field matches key) and pairsB (the rest)
def load_pairs(file, key)
  pairsA = {}
  pairsB = {}
  key_pattern = /#{key}/ # built once instead of once per line
  # File.foreach closes the file when done (File.open(...).each leaked it).
  File.foreach(file) do |line|
    line.chomp!
    fields = line.split("\t")
    next if fields.empty? # blank lines used to be stored under a nil key
    target = (fields.first =~ key_pattern) ? pairsA : pairsB
    save_record(target, fields.last, fields.first)
  end
  return pairsA, pairsB
end
|
22
|
+
|
23
|
+
# Appends +val+ to the list stored in +hash+ under +key+, creating the list
# the first time the key is seen.
#
# @param hash [Hash] table being filled (modified in place)
# @param key [Object] entry to append to
# @param val [Object] value to append
# @return [Array] the list stored under +key+ after the append
def save_record(hash, key, val)
  (hash[key] ||= []) << val
end
|
31
|
+
|
32
|
+
# Opens +n_files+ output files for writing, named "<output>1.txt",
# "<output>2.txt", and so on.
#
# The caller owns the returned handles and must close them.
#
# @param n_files [Integer] number of files to create
# @param output [String] path prefix placed before the file number
# @return [Array<File>] the open handles, in numbering order
def generate_files(n_files, output)
  files = []
  begin
    n_files.times do |n|
      files << File.open("#{output}#{n + 1}.txt", 'w')
    end
  rescue StandardError
    # Do not leak the handles that were opened before the failure.
    files.each(&:close)
    raise
  end
  files
end
|
39
|
+
|
40
|
+
# Writes one line per combination of a first-group and a second-group element
# that share the same key.
#
# For every key present in both hashes, each value from pairsA is combined
# with each value from pairsB and written as "vA<TAB>key<TAB>vB" to one of the
# output files, picked at random with rand(n_files).
#
# @param pairsA [Hash] key => list of first-group elements
# @param pairsB [Hash] key => list of second-group elements
# @param n_files [Integer] number of output files to spread the lines over
# @param files [Array<#puts>] open output handles
def connect_pairs_write(pairsA, pairsB, n_files, files)
  pairsA.each do |keyA, valA|
    valB = pairsB[keyA]
    next if valB.nil? # key only present in the first group

    # product yields the combinations in the same order as two nested loops
    valA.product(valB) do |vA, vB|
      files[rand(n_files)].puts "#{vA}\t#{keyA}\t#{vB}"
    end
  end
end
|
52
|
+
|
53
|
+
# Collects, for every first-group/second-group element combination, the keys
# through which the two elements are connected.
#
# @param pairsA [Hash] key => list of first-group elements
# @param pairsB [Hash] key => list of second-group elements
# @return [Hash] "vA_vB" => list of shared keys
def get_relations(pairsA, pairsB)
  relations = {}
  pairsA.each do |keyA, valA|
    valB = pairsB[keyA]
    next if valB.nil? # key only present in the first group

    valA.product(valB) do |vA, vB|
      # NOTE(review): the relation key joins both ids with '_'; code that
      # splits it back on '_' assumes the ids contain no underscore.
      (relations["#{vA}_#{vB}"] ||= []) << keyA
    end
  end
  relations
end
|
73
|
+
|
74
|
+
##############################
|
75
|
+
#OPTPARSE
|
76
|
+
##############################
|
77
|
+
|
78
|
+
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:input_file] = nil
  opts.on("-i", "--input_file PATH", "Input file for create adjacency matrix") do |input_file|
    options[:input_file] = input_file
  end

  options[:key] = ''
  opts.on("-k", "--key STRING", "String to split the two groups") do |key|
    options[:key] = key
  end

  options[:output] = 'tri_'
  opts.on("-o", "--output PATH", "Output network pairs") do |output|
    options[:output] = output
  end

  # DecimalInteger makes OptionParser reject non-numeric input instead of
  # letting to_i turn it silently into 0.
  options[:n_files] = 10
  opts.on("-n", "--n_files INTEGER", OptionParser::DecimalInteger, "Split data onto n files") do |n|
    options[:n_files] = n
  end

  options[:min_connections] = 1
  opts.on("-m", "--min_connections INTEGER", OptionParser::DecimalInteger, "Minimum connections to take into account a relation") do |n|
    options[:min_connections] = n
  end

end.parse!

################################
## MAIN
################################
# Validate before creating any output file.
abort('ERROR: an input file is required (-i)') if options[:input_file].nil?
abort('ERROR: --n_files must be at least 1') if options[:n_files] < 1

files = generate_files(options[:n_files], options[:output])
begin
  pairsA, pairsB = load_pairs(options[:input_file], options[:key])
  if options[:min_connections] == 1
    connect_pairs_write(pairsA, pairsB, options[:n_files], files)
  else
    STDERR.puts "MIN. NUMBER OF CONNECTIONS = #{options[:min_connections]}"
    relations = get_relations(pairsA, pairsB)
    count = 0
    discarded = 0
    relations.each do |rel, connections|
      if connections.length >= options[:min_connections]
        # NOTE(review): relation keys are "<a>_<b>"; ids that themselves
        # contain '_' are truncated by this split - confirm the id format.
        fields = rel.split('_')
        connections.each do |con|
          files[rand(options[:n_files])].puts "#{fields.first}\t#{con}\t#{fields.last}"
        end
      else
        discarded += connections.length
      end
      count += connections.length
    end
    STDERR.puts "Relations: #{count}"
    STDERR.puts "Discarded: #{discarded}"
  end
ensure
  # Close (and flush) the outputs even when processing fails half-way.
  files.each(&:close)
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
ROOT_PATH = File.dirname(__FILE__)
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
# The shared methods ship under lib/pets in this package; the previous
# 'gephepred' directory is not part of it, so the require below could not
# be resolved from this load path.
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))

require 'generalMethods.rb'
require 'optparse'
|
10
|
+
|
11
|
+
###############
|
12
|
+
#METHODS
|
13
|
+
###############
|
14
|
+
|
15
|
+
# Replaces the HPO code field of every patient record with the HPO names.
#
# The first element of each record is a '|'-separated list of HPO codes. It is
# removed from the front of the record and the matching names, also joined with
# '|', are appended at the end. The name of a code is read from position 1 of
# its hpo_storage entry.
#
# The input records are no longer modified in place. A code that is missing
# from hpo_storage is kept as-is instead of raising NoMethodError.
#
# @param patient_data [Hash] patient id => [hpo_codes, *other_fields]
# @param hpo_storage [Hash] HPO code => array whose element 1 is the term name
# @return [Hash] patient id => [*other_fields, hpo_names]
def translate_codes_to_terms(patient_data, hpo_storage)
  patients_with_hpo_names = {}
  patient_data.each do |patientID, hpos_and_cnvs|
    hpo_field, *other_fields = hpos_and_cnvs
    hpo_names = hpo_field.to_s.split('|').map do |hpo|
      hpo_info = hpo_storage[hpo]
      hpo_info.nil? ? hpo : hpo_info[1]
    end
    patients_with_hpo_names[patientID] = other_fields << hpo_names.join('|')
  end
  patients_with_hpo_names
end
|
29
|
+
|
30
|
+
# Writes one tab-separated line per patient: the patient id followed by its
# data fields.
#
# Every "_i<digit>" fragment is stripped from the id before writing.
# NOTE(review): the pattern removes a single digit only, so an id ending in
# "_i12" keeps the trailing "2" - confirm whether multi-digit suffixes occur.
#
# @param patients_with_hpo_names [Hash] patient id => array of fields
# @param output_file [String] path of the file to create or overwrite
def save_translated_file(patients_with_hpo_names, output_file)
  # The block form closes the file even if writing raises.
  File.open(output_file, 'w') do |handler|
    patients_with_hpo_names.each do |id, data|
      patientID = id.gsub(/_i[0-9]/,'')
      handler.puts "#{patientID}\t#{data.join("\t")}"
    end
  end
end
|
38
|
+
|
39
|
+
###############
|
40
|
+
#OPTIONS
|
41
|
+
###############
|
42
|
+
|
43
|
+
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:chromosome_col] = nil
  opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
    options[:chromosome_col] = data
  end

  options[:pat_id_col] = nil
  opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
    options[:pat_id_col] = data
  end

  options[:end_col] = nil
  opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
    options[:end_col] = data
  end

  # A header line is expected by default; passing -H switches it off. The help
  # text used to state the opposite of what the flag does.
  options[:header] = true
  opts.on("-H", "--header", "Set if the file has NO header line. A header is expected by default") do
    options[:header] = false
  end

  options[:output_file] = 'paco_file_with_hpo_names.txt'
  opts.on("-o", "--output_file PATH", "Output paco file with HPO names") do |data|
    options[:output_file] = data
  end

  options[:input_file] = nil
  opts.on("-P", "--input_file PATH", "Input file with PACO extension") do |value|
    options[:input_file] = value
  end

  options[:hpo_col] = nil
  opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
    options[:hpo_col] = data
  end

  options[:start_col] = nil
  opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
    options[:start_col] = data
  end

end.parse!


###############
#MAIN
###############

hpo_storage = load_hpo_file(HPO_FILE)
# load_patient_cohort receives the whole option table; its second return value
# is kept in a global, presumably for the shared library code - TODO confirm.
patient_data, $patient_number = load_patient_cohort(options)
patients_with_hpo_names = translate_codes_to_terms(patient_data, hpo_storage)

save_translated_file(patients_with_hpo_names, options[:output_file])


Process.exit
|
data/bin/phen2reg.rb
ADDED
@@ -0,0 +1,385 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
# Rojano E. & Seoane P., September 2016
|
3
|
+
# Program to predict the position from given HPO codes, sorted by their association values.
|
4
|
+
|
5
|
+
REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
ROOT_PATH = File.dirname(__FILE__)
# The shared methods ship under lib/pets in this package; the previous
# 'gephepred' directory is not part of it, so generalMethods.rb and
# phen2reg_methods.rb could not be resolved from this load path.
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
require 'net/ftp'
require 'net/http'
require 'zlib'
require 'json'
require 'generalMethods.rb'
require 'phen2reg_methods.rb'
require 'optparse'
require 'report_html'
|
16
|
+
|
17
|
+
|
18
|
+
##########################
|
19
|
+
#METHODS
|
20
|
+
##########################
|
21
|
+
|
22
|
+
# Computes, for every predicted region, the percentage of the patient's
# phenotypes that the region recovers, records it, and removes the regions
# whose percentage is below the minimum.
#
# Each region is an array whose fourth element is the list of HPOs associated
# with it. The percentage of every region (kept or not) is appended to
# predicted_hpo_percentage[patient_number]. Regions are deleted in place from
# adjacent_regions_joined.
#
# An empty phenotype list now yields 0.0 instead of NaN (0 / 0.0).
#
# @param adjacent_regions_joined [Array<Array>] regions, filtered in place
# @param patient_original_phenotypes [Array] HPOs of the patient profile
# @param predicted_hpo_percentage [Hash] patient number => list of percentages (filled in place)
# @param min_hpo_recovery_percentage [Numeric] regions strictly below it are removed
# @param patient_number [Object] key used in predicted_hpo_percentage
# @return [Array<Integer>] positions of the removed regions
def calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number)
  total_phenotypes = patient_original_phenotypes.length
  records_to_delete = []
  adjacent_regions_joined.each_with_index do |region, index|
    hpo_list = region[3]
    recovered = (patient_original_phenotypes & hpo_list).length
    recovery_percentage = total_phenotypes.zero? ? 0.0 : recovered / total_phenotypes.to_f * 100
    records_to_delete << index if recovery_percentage < min_hpo_recovery_percentage
    (predicted_hpo_percentage[patient_number] ||= []) << recovery_percentage
  end
  # Delete from the end so the remaining positions stay valid.
  records_to_delete.reverse_each do |index|
    adjacent_regions_joined.delete_at(index)
  end
end
|
41
|
+
|
42
|
+
# Downloads a file over FTP using the default (no-argument) login.
#
# @param ftp_server [String] host name of the FTP server
# @param path [String] remote path of the file
# @param name [String] local path where the file is stored
def download(ftp_server, path, name)
  ftp = Net::FTP.new()
  begin
    ftp.connect(ftp_server)
    ftp.login
    ftp.getbinaryfile(path, name)
  ensure
    # Release the connection even when the login or the transfer fails.
    ftp.close
  end
end
|
49
|
+
|
50
|
+
##########################
|
51
|
+
#OPT-PARSER
|
52
|
+
##########################
|
53
|
+
|
54
|
+
# Command-line options. Numeric options use OptionParser coercion so that
# non-numeric input is rejected instead of being turned silently into 0 by
# to_f/to_i. Flag names (including the 'thresold' spelling) are kept as
# released for backward compatibility.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"
  options[:best_thresold] = 1.5
  opts.on("-b", "--best_thresold FLOAT", Float, "Association value threshold") do |best_thresold|
    options[:best_thresold] = best_thresold
  end

  options[:freedom_degree] = 'prednum'
  opts.on("-d", "--freedom_degree STRING", "Type of freedom degree calculation: prednum, phennum, maxnum") do |fd|
    options[:freedom_degree] = fd
  end

  options[:html_file] = "patient_profile_report.html"
  opts.on("-F", "--html_file PATH", "HTML file with patient information HPO profile summary") do |html_file|
    options[:html_file] = html_file
  end

  options[:hpo_file] = nil
  opts.on("-f", "--hpo_file PATH", "Input hp.obo file") do |hpo_file|
    options[:hpo_file] = hpo_file
  end

  options[:information_coefficient] = nil
  opts.on("-i", "--information_coefficient PATH", "Input file with information coefficients") do |information_coefficient|
    options[:information_coefficient] = information_coefficient
  end

  options[:retrieve_kegg_data] = false
  opts.on('-k', "--retrieve_kegg_data", "Add KEGG data to prediction report") do
    options[:retrieve_kegg_data] = true
  end

  options[:print_matrix] = false
  opts.on('-m', "--print_matrix", "Print output matrix") do
    options[:print_matrix] = true
  end

  options[:max_number] = 10
  opts.on("-M", "--max_number INTEGER", OptionParser::DecimalInteger, "Max number of regions to take into account") do |max_number|
    options[:max_number] = max_number
  end

  options[:hpo_is_name] = false
  opts.on("-n", "--hpo_is_name", "Set this flag if phenotypes are given as names instead of codes") do
    options[:hpo_is_name] = true
  end

  options[:output_quality_control] = "output_quality_control.txt"
  opts.on("-O", "--output_quality_control PATH", "Output file with quality control of all input HPOs") do |output_quality_control|
    options[:output_quality_control] = output_quality_control
  end

  options[:output_matrix] = 'output_matrix.txt'
  opts.on("-o", "--output_matrix PATH", "Output matrix file, with associations for each input HPO") do |output_matrix|
    options[:output_matrix] = output_matrix
  end

  options[:prediction_data] = nil
  #chr\tstart\tstop
  opts.on("-p", "--prediction_file PATH", "Input data with HPO codes for predicting their location. It can be either, a file path or string with HPO separated by pipes (|)") do |input_path|
    options[:prediction_data] = input_path
  end

  options[:pvalue_cutoff] = 0.1
  opts.on("-P", "--pvalue_cutoff FLOAT", Float, "P-value cutoff") do |pvalue_cutoff|
    options[:pvalue_cutoff] = pvalue_cutoff
  end

  options[:quality_control] = true
  opts.on("-Q", "--no_quality_control", "Disable quality control") do
    options[:quality_control] = false
  end

  options[:ranking_style] = ''
  opts.on("-r", "--ranking_style STRING", "Ranking style: mean, fisher, geommean") do |ranking_style|
    options[:ranking_style] = ranking_style
  end

  # Despite its name, passing this flag turns the recovery file OFF.
  options[:write_hpo_recovery_file] = true
  opts.on("-s", "--write_hpo_recovery_file", "Disable write hpo recovery file") do
    options[:write_hpo_recovery_file] = false
  end

  # Despite its name, passing this flag turns grouping by region OFF.
  options[:group_by_region] = true
  opts.on("-S", "--group_by_region", "Disable prediction which HPOs are located in the same region") do
    options[:group_by_region] = false
  end

  options[:html_reporting] = true
  opts.on("-T", "--no_html_reporting", "Disable html reporting") do
    options[:html_reporting] = false
  end

  options[:training_file] = nil
  #chr\tstart\tstop\tphenotype\tassociation_value
  opts.on("-t", "--training_file PATH", "Input training file, with association values") do |training_path|
    options[:training_file] = training_path
  end

  options[:multiple_profile] = false
  opts.on("-u", "--multiple_profile", "Set if multiple profiles") do
    options[:multiple_profile] = true
  end

  # The value is stored as a float, so the help text advertises FLOAT.
  options[:hpo_recovery] = 50
  opts.on("-y", "--hpo_recovery FLOAT", Float, "Minimum percentage of HPO terms to consider predictions") do |hpo_recovery|
    options[:hpo_recovery] = hpo_recovery
  end

end.parse!
|
165
|
+
|
166
|
+
##########################
|
167
|
+
#PATHS
|
168
|
+
##########################
|
169
|
+
# Locations of the external data files, all relative to this script.
all_paths = {code: File.join(File.dirname(__FILE__), '..')}
all_paths[:external_data] = File.join(all_paths[:code], 'external_data')
all_paths[:gene_data] = File.join(all_paths[:external_data], 'gene_data.gz')
all_paths[:biosystems_gene] = File.join(all_paths[:external_data], 'biosystems_gene.gz')
all_paths[:biosystems_info] = File.join(all_paths[:external_data], 'bsid2info.gz')
all_paths[:gene_data_with_pathways] = File.join(all_paths[:external_data], 'gene_data_with_pathways.gz')
all_paths[:gene_location] = File.join(all_paths[:external_data], 'gene_location.gz')

##########################
#DOWNLOADS
##########################
# Each entry is [ftp server, remote path, local destination].
sources = [
  ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
]
sources.each do |server, path, output|
  # Fetch only what is not already on disk. File.exists? was removed in
  # Ruby 3.2; File.exist? works on every Ruby version.
  download(server, path, output) if !File.exist?(output)
end
|
188
|
+
|
189
|
+
##########################
|
190
|
+
#MAIN
|
191
|
+
##########################
|
192
|
+
|
193
|
+
# Normalises options[:prediction_data] into a list of profiles, each profile
# being a list of phenotypes. The option is either a file path or the
# phenotypes themselves given on the command line.
# Without -p the File.exist? call below raised TypeError on nil.
abort('ERROR: prediction data is required (-p)') if options[:prediction_data].nil?

if File.exist?(options[:prediction_data])
  # File.readlines closes the file; File.open(...).readlines/each leaked it.
  prediction_lines = File.readlines(options[:prediction_data]).map{|line| line.chomp}
  if !options[:multiple_profile]
    # Single profile: one phenotype per line
    options[:prediction_data] = [prediction_lines]
  else
    # Multiple profiles: one profile per line, phenotypes separated by pipes
    options[:prediction_data] = prediction_lines.map{|line| line.split('|')}
  end
else
  # if you want to add phenotypes through the terminal
  if !options[:multiple_profile]
    options[:prediction_data] = [options[:prediction_data].split('|')]
  else
    # Profiles are separated by '!', phenotypes inside a profile by '|'
    options[:prediction_data] = options[:prediction_data].split('!').map{|profile| profile.split('|')}
  end
end
|
213
|
+
|
214
|
+
##########################
|
215
|
+
#- Loading data
|
216
|
+
|
217
|
+
hpo_storage = load_hpo_file(options[:hpo_file])
# Only needed by the per-profile quality control step
if options[:quality_control]
  hpo_child_metadata = get_child_parent_relations(hpo_storage)
  hpos_ci_values = load_hpo_ci_values(options[:information_coefficient])
end

genes_with_kegg = {}
gene_location = {}
if options[:retrieve_kegg_data]
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  if !File.exist?(all_paths[:gene_data_with_pathways]) || !File.exist?(all_paths[:gene_location])
    # Build both derived files from the downloaded sources and store them
    gene_list, gene_location = load_gene_data(all_paths[:gene_data])
    ### kegg_data = parse_kegg_data(genes_found_attributes.keys)
    kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
    genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
    write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
    write_compressed_plain_file(gene_location, all_paths[:gene_location])
  else
    # Both derived files already exist: read them back
    gene_location = read_compressed_json(all_paths[:gene_location])
    genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
  end
end

# hpo_dictionary = load_hpo_dictionary_name2code(options[:hpo2name_file]) if options[:hpo_is_name]
trainingData = load_training_file4HPO(options[:training_file], options[:best_thresold])
|
241
|
+
|
242
|
+
##########################
|
243
|
+
#- HPO PROFILE ANALYSIS
|
244
|
+
|
245
|
+
phenotypes_by_patient = {}
predicted_hpo_percentage = {}
# One iteration per HPO profile; patient_number is the 0-based profile index.
options[:prediction_data].each_with_index do |patient_hpo_profile, patient_number|
  phenotypes_by_patient[patient_number] = patient_hpo_profile
  # Both are only filled on some of the paths below. They are declared here so
  # the paths that skip them see an explicit nil.
  adjacent_regions_joined = nil
  genes_with_kegg_data = nil

  if options[:hpo_is_name]
    # NOTE(review): the dictionary is rebuilt for every profile; it could be
    # built once before the loop if hpo_storage is not modified - confirm.
    hpo_dictionary = create_hpo_dictionary(hpo_storage)
    # Translate names to codes in place; names missing from the dictionary
    # become nil and are dropped.
    patient_hpo_profile.map!{|name| hpo_dictionary[name]}
    patient_hpo_profile.compact!
  end

  #HPO quality control
  #---------------------------
  characterised_hpos = []
  if options[:quality_control]
    characterised_hpos = hpo_quality_control(patient_hpo_profile, hpo_storage, hpo_child_metadata, hpos_ci_values)
    header = ["HPO name", "HPO code", "Exists?", "CI value", "Is child of", "Childs"]
    # Block form: the file is closed even if building the table raises
    File.open(options[:output_quality_control], "w") do |output_quality_control|
      output_quality_control.puts Terminal::Table.new :headings => header, :rows => characterised_hpos
    end
  end

  #Prediction steps
  #---------------------------
  hpo_regions = search4HPO(patient_hpo_profile, trainingData)
  if hpo_regions.empty?
    puts "ProfID:#{patient_number}\tResults not found"
  elsif options[:group_by_region] == false
    hpo_regions.each do |hpo, regions|
      regions.each do |region|
        puts "ProfID:#{patient_number}\t#{hpo}\t#{region.join("\t")}"
      end
    end
  elsif options[:group_by_region] == true
    region2hpo, regionAttributes, association_scores = group_by_region(hpo_regions)
    null_value = 0
    hpo_region_matrix = generate_hpo_region_matrix(region2hpo, association_scores, patient_hpo_profile, null_value)
    if options[:print_matrix]
      File.open(options[:output_matrix] + "_#{patient_number}", "w") do |output_matrix|
        output_matrix.puts "Region\t#{patient_hpo_profile.join("\t")}"
        regionAttributes_array = regionAttributes.values
        hpo_region_matrix.each_with_index do |association_values, i|
          chr, start, stop = regionAttributes_array[i]
          output_matrix.puts "#{chr}:#{start}-#{stop}\t#{association_values.join("\t")}"
        end
      end
    end

    scoring_regions(regionAttributes, hpo_region_matrix, options[:ranking_style], options[:pvalue_cutoff], options[:freedom_degree], null_value)
    if regionAttributes.empty?
      puts "ProfID:#{patient_number}\tResults not found"
    else
      adjacent_regions_joined = []
      regionAttributes.each do |regionID, attributes|
        chr, start, stop, _patient_id, _region_length, score = attributes
        association_values = association_scores[regionID]
        adjacent_regions_joined << [chr, start, stop, association_values.keys, association_values.values, score]
      end
      adjacent_regions_joined = join_regions(adjacent_regions_joined) # TODO: move this before the matrix is built

      #Ranking
      # The score is the last field of each region: ascending order for
      # 'fisher', descending for every other ranking style.
      if options[:ranking_style] == 'fisher'
        adjacent_regions_joined.sort!{|r1, r2| r1.last <=> r2.last}
      else
        adjacent_regions_joined.sort!{|r1, r2| r2.last <=> r1.last}
      end
      patient_original_phenotypes = phenotypes_by_patient[patient_number]
      calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, options[:hpo_recovery], patient_number)
      if adjacent_regions_joined.empty?
        puts "ProfID:#{patient_number}\tResults not found"
      else
        adjacent_regions_joined = adjacent_regions_joined[0..options[:max_number]-1] if !options[:max_number].nil?
        adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score|
          puts "ProfID:#{patient_number}\t#{chr}\t#{start}\t#{stop}\t#{hpo_list.join(',')}\t#{association_values.join(',')}\t#{score}"
        end
      end
    end
  end #elsif

  pathway_stats = {}
  if options[:retrieve_kegg_data]
    genes_found = []
    # adjacent_regions_joined is nil when no grouped prediction was produced;
    # iterating it directly raised NoMethodError in that case.
    (adjacent_regions_joined || []).each do |adjacent_region|
      ref_chr, ref_start, ref_stop = adjacent_region
      # A chromosome without gene entries contributes an empty gene list
      chr_genes = gene_location[ref_chr] || []
      genes = []
      chr_genes.each do |gene_name, gene_start, gene_stop|
        if (ref_start > gene_start && ref_stop < gene_stop) ||
          (ref_start < gene_start && ref_stop > gene_stop) ||
          (ref_start < gene_start && ref_stop > gene_start) ||
          (ref_start < gene_stop && ref_stop > gene_stop)
          genes << gene_name
        end
      end
      genes_found << genes
    end

    # Pair every gene with its entry in genes_with_kegg, one list per region
    genes_with_kegg_data = genes_found.map do |genes|
      genes.map{|gene| [gene, genes_with_kegg[gene]]}
    end
    pathway_stats = compute_pathway_enrichment(genes_with_kegg_data, genes_with_kegg)
    pathway_stats.sort!{|p1, p2| p1.last <=> p2.last}
  end

  #Creating html report
  #-------------------
  ####PLEASE CHECK THIS METHOD!
  # NOTE(review): every profile writes to the same html_file path - confirm
  # whether report_data is expected to overwrite it for multiple profiles.
  report_data(characterised_hpos, adjacent_regions_joined, options[:html_file], hpo_storage, genes_with_kegg_data, pathway_stats) if options[:html_reporting]
end # end each_with_index
|
376
|
+
|
377
|
+
# Writes one line per recorded recovery percentage, tagged with the profile
# number, to 'output_profile_recovery' in the working directory.
if options[:write_hpo_recovery_file]
  # Block form: the file is closed even if writing raises
  File.open('output_profile_recovery', 'w') do |handler|
    predicted_hpo_percentage.each do |patient, percentage|
      percentage.each do |perc|
        handler.puts "ProfID:#{patient}\t#{perc.inspect}"
      end
    end
  end
end
|