pets 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +6 -0
- data/bin/area_under_curve_pr.rb +118 -0
- data/bin/association_metrics_average.rb +94 -0
- data/bin/coPatReporter.rb +531 -0
- data/bin/console +14 -0
- data/bin/fmeasure_index.rb +72 -0
- data/bin/get_PR_values.rb +90 -0
- data/bin/get_clusters.R +18 -0
- data/bin/get_network_nodes.rb +197 -0
- data/bin/lines.R +77 -0
- data/bin/merge_by_cluster.rb +62 -0
- data/bin/merge_pairs.rb +138 -0
- data/bin/paco_translator.rb +102 -0
- data/bin/phen2reg.rb +385 -0
- data/bin/phen2reg_predictor_check.rb +297 -0
- data/bin/plot_area.R +71 -0
- data/bin/plot_boxplot.R +21 -0
- data/bin/plot_density.R +46 -0
- data/bin/plot_scatterplot.R +25 -0
- data/bin/reg2phen.rb +116 -0
- data/bin/region_to_patients_generator.rb +84 -0
- data/bin/relate_CI_to_association_value.rb +90 -0
- data/bin/setup +8 -0
- data/bin/standardize_scores.R +40 -0
- data/bin/xyplot_graph.R +60 -0
- data/external_data/biosystems_gene.gz +0 -0
- data/external_data/bsid2info.gz +0 -0
- data/external_data/chromosome_sizes_hg19.txt +24 -0
- data/external_data/gene_data.gz +0 -0
- data/external_data/gene_data_with_pathways.gz +0 -0
- data/external_data/gene_location.gz +0 -0
- data/external_data/hp.obo +146363 -0
- data/external_data/remove +0 -0
- data/lib/pets.rb +6 -0
- data/lib/pets/coPatReporterMethods.rb +77 -0
- data/lib/pets/generalMethods.rb +556 -0
- data/lib/pets/phen2reg_methods.rb +432 -0
- data/lib/pets/version.rb +3 -0
- data/pets.gemspec +47 -0
- data/templates/cohort_report.erb +93 -0
- data/templates/patient_report.erb +209 -0
- metadata +183 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a629423c3668446b726dc91d984efe15e730ed48
|
4
|
+
data.tar.gz: 11b466ef84cdf9d84354fb54f7c5371fee6fe067
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c361cd825328b9265851eef94ae704e2bfc4c88c215b193ee10edb04e2caa93d538c66c7701d09b2ae12c018921f8d579a133705a5ab16e342e5cf961abbd370
|
7
|
+
data.tar.gz: 8acfa703181a2787f8e50c36b0dd2eb8704b7f3f10f6b45e2a7777e3a7c144182c94ffad57a919efb51c6c22817fff436c1b549f6eee5e7fc5ea07853f6a5512
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2019 elenarojano
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# Gephepred
|
2
|
+
|
3
|
+
Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/gephepred`. To experiment with that code, run `bin/console` for an interactive prompt.
|
4
|
+
|
5
|
+
TODO: Delete this and the text above, and describe your gem
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'gephepred'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install gephepred
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
TODO: Write usage instructions here
|
26
|
+
|
27
|
+
## Development
|
28
|
+
|
29
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
30
|
+
|
31
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
32
|
+
|
33
|
+
## Contributing
|
34
|
+
|
35
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/gephepred.
|
36
|
+
|
37
|
+
|
38
|
+
## License
|
39
|
+
|
40
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
41
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
#Tool for calculating the AUC of PR curves.
|
4
|
+
|
5
|
+
##########################
|
6
|
+
#LIBRARIES
|
7
|
+
##########################
|
8
|
+
|
9
|
+
require 'optparse'
|
10
|
+
|
11
|
+
##########################
|
12
|
+
#METHODS
|
13
|
+
##########################
|
14
|
+
|
15
|
+
# Read (x, y) coordinate pairs from a tab-separated file.
#
# Any line containing 'rec' (which also covers 'prec') is treated as a header
# and skipped. Missing or non-numeric fields become 0.0 (String#to_f / nil.to_f).
#
# input_file - path of the tab-separated file.
# x_val_col  - 1-based column holding the x values.
# y_val_col  - 1-based column holding the y values.
#
# Returns an Array of [x, y] Float pairs sorted by ascending x.
def load_file(input_file, x_val_col, y_val_col)
  coordinates = []
  # File.foreach closes the file when iteration finishes; the previous
  # File.open(...).each left the handle open.
  File.foreach(input_file) do |line|
    line = line.chomp
    next if line.include?('rec') # header line ('prec' contains 'rec' too)
    fields = line.split("\t")
    coordinates << [fields[x_val_col - 1].to_f, fields[y_val_col - 1].to_f]
  end
  coordinates.sort { |r1, r2| r1[0] <=> r2[0] }
end
|
28
|
+
|
29
|
+
|
30
|
+
# Compute the area under a curve given as points sorted by x.
#
# The first point contributes the rectangle between x = 0 and its own x at
# height y. Every following segment contributes its trapezoid area
# dx * (y_prev + y_cur) / 2.
#
# The previous implementation used dx * y_cur + dx * |dy| / 2, which is only
# a trapezoid when the curve is falling; a rising segment was over-counted
# (e.g. [[0, 0], [1, 1]] gave 1.5 instead of 0.5).
#
# pr_values - Array of [x, y] pairs, e.g. [[x, y], [x', y'], ...].
#
# Returns the accumulated area (0 for an empty input).
def calculate_auc(pr_values)
  total_area = 0
  prev_x = 0
  prev_y = 0
  pr_values.each_with_index do |(x, y), position|
    if position.zero?
      total_area += x * y
    else
      # 2.0 keeps the division exact even when integer coordinates are given.
      total_area += (prev_x - x).abs * (prev_y + y) / 2.0
    end
    prev_x = x
    prev_y = y
  end
  total_area
end
|
56
|
+
|
57
|
+
# def calculate_auc(pr_values)
|
58
|
+
# #pr_values = [[x, y], [x', y']...]
|
59
|
+
# counter = 0
|
60
|
+
# x_val = 0
|
61
|
+
# y_val = 0
|
62
|
+
# total_square_area = 0
|
63
|
+
# total_triangle_area = 0
|
64
|
+
# pr_values.each do |xy_pair|
|
65
|
+
# if counter != 0
|
66
|
+
# x_prime = xy_pair[0]
|
67
|
+
# y_prime = xy_pair[1]
|
68
|
+
# #puts "#{x_prime}\t#{x_val}"
|
69
|
+
# #puts "#{y_prime}\t#{y_val}"
|
70
|
+
# total_square_area += (x_val - x_prime) * y_prime
|
71
|
+
# total_triangle_area += (x_val - x_prime) * (y_prime - y_val) / 2
|
72
|
+
# x_val = x_prime
|
73
|
+
# y_val = y_prime
|
74
|
+
# else
|
75
|
+
# x_val = xy_pair[0]
|
76
|
+
# y_val = xy_pair[1]
|
77
|
+
# counter += 1
|
78
|
+
# end
|
79
|
+
# end
|
80
|
+
# total_area = total_square_area + total_triangle_area
|
81
|
+
# STDERR.puts total_area
|
82
|
+
# return total_area
|
83
|
+
# end
|
84
|
+
|
85
|
+
|
86
|
+
##########################
|
87
|
+
#OPT-PARSE
|
88
|
+
##########################
|
89
|
+
|
90
|
+
# Command line options. All three are needed by the main section.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:input_file] = nil
  opts.on("-f", "--input_file PATH", "Precision-recall values file") do |input_file|
    options[:input_file] = input_file
  end

  # The long switches were declared as "--:x_values" / "--:y_values" (stray
  # colon). The clean spelling is added; the old one is kept so existing
  # command lines keep working.
  options[:x_values] = nil
  opts.on("-x", "--x_values INTEGER", "--:x_values INTEGER", "Set column for extracting x values") do |x_values|
    options[:x_values] = x_values.to_i
  end

  options[:y_values] = nil
  opts.on("-y", "--y_values INTEGER", "--:y_values INTEGER", "Set column for extracting y values") do |y_values|
    options[:y_values] = y_values.to_i
  end

end.parse!
|
110
|
+
|
111
|
+
##########################
|
112
|
+
#MAIN
|
113
|
+
##########################
|
114
|
+
|
115
|
+
# Every option is mandatory: stop with a readable message instead of failing
# later with a TypeError/NoMethodError on a nil value.
missing = %i[input_file x_values y_values].select { |name| options[name].nil? }
abort "ERROR: missing mandatory option(s): #{missing.join(', ')}. Use --help for usage." unless missing.empty?

pr_values = load_file(options[:input_file], options[:x_values], options[:y_values])
final_area = calculate_auc(pr_values)
puts final_area
|
@@ -0,0 +1,94 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#Tool for calculating averages across several association value files.
|
3
|
+
#File structure: prec rec cut meth
|
4
|
+
#Load all files (7) stored in the same directory and calculate average;
|
5
|
+
#of lines for each method. Return a file with the same structure;
|
6
|
+
#giving name as "average" to the last column
|
7
|
+
|
8
|
+
require 'optparse'
|
9
|
+
|
10
|
+
##########################
|
11
|
+
#METHODS
|
12
|
+
##########################
|
13
|
+
|
14
|
+
# Load a tab-separated association file whose first line is a header.
#
# Each data line is split into four fields; the first three are converted
# with to_f and the fourth is kept as a String.
#
# filename - path of the file to read.
#
# Returns [rows, header]: rows is an Array of [Float, Float, Float, String]
# and header is the first line without its line terminator ('' when the file
# is empty).
def load_association_file(filename)
  header = ''
  rows = []
  # File.foreach closes the file once iteration ends; File.open(...).each
  # left the handle open.
  File.foreach(filename).with_index do |line, line_number|
    line = line.chomp
    if line_number.zero?
      header = line
    else
      cut, precision, recall, meth = line.split("\t")
      rows << [cut.to_f, precision.to_f, recall.to_f, meth]
    end
  end
  return rows, header
end
|
30
|
+
|
31
|
+
# Average the selected columns line by line across several loaded files.
#
# The first file is the reference: the result has one row per reference row,
# and columns not listed in cols_for_average keep the reference values.
# Unlike the previous version, neither all_files nor its rows are modified
# (it used to shift the first file off the list and overwrite its rows).
#
# all_files        - Array of files, each an Array of rows (Arrays).
# cols_for_average - Array of 0-based column indexes to average.
#
# Returns an Array of new rows; [] when all_files is empty.
def calculate_average(all_files, cols_for_average)
  return [] if all_files.empty?
  n_files = all_files.length.to_f
  ref_file, *other_files = all_files
  ref_file.each_with_index.map do |ref_line, i|
    averaged = ref_line.dup
    cols_for_average.each do |col|
      # Same left-to-right summation order as before: reference first.
      total = other_files.inject(ref_line[col]) { |sum, file| sum + file[i][col] }
      averaged[col] = total / n_files
    end
    averaged
  end
end
|
53
|
+
|
54
|
+
|
55
|
+
##########################
|
56
|
+
#OPT-PARSER
|
57
|
+
##########################
|
58
|
+
|
59
|
+
# Command line options: the files to average and the columns to use.
options = { file_names: nil, which_cols: nil }
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-f", "--file_names STRING", "Input file names to calculate averages. Please separate names by commas") do |names|
    options[:file_names] = names.split(',')
  end

  # Columns are given 1-based on the command line and stored 0-based.
  opts.on("-c", "--which_cols STRING", "Cols for performing average analysis") do |cols|
    options[:which_cols] = cols.split(',').map { |col| col.to_i - 1 }
  end
end
parser.parse!
|
74
|
+
|
75
|
+
##########################
|
76
|
+
#MAIN
|
77
|
+
##########################
|
78
|
+
|
79
|
+
# Load every input file; the header printed below is the one of the last file.
header = nil
all_files = options[:file_names].map do |filename|
  file, header = load_association_file(filename)
  file
end

average = calculate_average(all_files, options[:which_cols])

puts header
average.each { |line| puts line.join("\t") }
|
92
|
+
|
93
|
+
|
94
|
+
|
@@ -0,0 +1,531 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
# Locations of the bundled resources, resolved relative to this script.
ROOT_PATH = File.dirname(__FILE__)
REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
CHR_SIZE = File.join(EXTERNAL_DATA, 'chromosome_sizes_hg19.txt')
# generalMethods.rb and coPatReporterMethods.rb are packaged under lib/pets;
# the previous entry pointed at lib/gephepred, which the package does not
# contain, so the requires below could not be resolved.
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
10
|
+
|
11
|
+
require 'optparse'
|
12
|
+
require 'csv'
|
13
|
+
require 'generalMethods.rb'
|
14
|
+
require 'coPatReporterMethods.rb'
|
15
|
+
require 'report_html'
|
16
|
+
|
17
|
+
##########################
|
18
|
+
#METHODS
|
19
|
+
##########################
|
20
|
+
# Positions of the fields inside each patient record array.
HPOS = 0 # HPO profile of the patient
|
21
|
+
CHR = 1 # chromosome
|
22
|
+
START = 2 # start coordinate
|
23
|
+
STOP = 3 # stop coordinate
|
24
|
+
|
25
|
+
# Normalise every patient record in place.
#
# The HPO field is split with options[:hpo_separator], optionally translated
# from names to codes (options[:hpo_names]), checked with check_hpo_codes and
# stored back as an Array. Start and stop coordinates are converted to
# Integer when present.
#
# Returns three values: the unique HPO terms of the cohort, a Hash
# pat_id => result of check_hpo_codes, and the unique rejected terms.
def format_patient_data(patient_data, options, name2code_dictionary, hpo_storage, hpo_parent_child_relations)
  cohort_terms = []
  rejected = []
  suggestions_by_patient = {}
  patient_data.each do |pat_id, record|
    raw_profile, _chr, first_coord, last_coord = record
    terms = raw_profile.split(options[:hpo_separator])
    if options[:hpo_names]
      translate_hpo_names2codes(terms, name2code_dictionary, pat_id, rejected)
    end
    suggestions_by_patient[pat_id] = check_hpo_codes(terms, hpo_storage, hpo_parent_child_relations, pat_id, rejected)
    cohort_terms.concat(terms)
    record[HPOS] = terms
    record[START] = first_coord.to_i unless first_coord.nil?
    record[STOP] = last_coord.to_i unless last_coord.nil?
  end
  return cohort_terms.uniq, suggestions_by_patient, rejected.uniq
end
|
41
|
+
|
42
|
+
# Replace, in place, the HPO names of a profile by their codes.
#
# Names missing from hpo_dictionary are reported on STDERR, appended to
# rejected_hpos and dropped from the profile.
#
# hpos           - Array of HPO names; rewritten in place with the codes.
# hpo_dictionary - Hash name => code.
# pat_id         - patient identifier, only used in the warning text.
# rejected_hpos  - Array collecting the unknown names.
#
# Returns hpos.
def translate_hpo_names2codes(hpos, hpo_dictionary, pat_id, rejected_hpos)
  hpo_codes = []
  hpos.each do |hpo_name|
    hpo_code = hpo_dictionary[hpo_name]
    if hpo_code.nil?
      STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAME '#{hpo_name}'. Rejected."
      rejected_hpos << hpo_name
    else
      hpo_codes << hpo_code
    end
  end
  # The caller keeps using the same Array object, so swap its contents.
  hpos.replace(hpo_codes)
end
|
56
|
+
|
57
|
+
# Validate the HPO codes of a profile and collect their more specific terms.
#
# Codes missing from hpo_storage are reported on STDERR, appended to
# rejected_hpos and removed from hpos. Known codes are replaced in place by
# the main code stored in hpo_storage (so alternate codes become main ones).
#
# hpos                       - Array of HPO codes; modified in place.
# hpo_storage                - Hash code => [main_code, name, ...].
# hpo_parent_child_relations - Hash main_code => child terms.
# pat_id                     - patient identifier, only used in the warning.
# rejected_hpos              - Array collecting the unknown codes.
#
# Returns an Array with one [[main_code, name], childs] entry per accepted
# code, where childs is [] when the relations hold nothing for that code.
def check_hpo_codes(hpos, hpo_storage, hpo_parent_child_relations, pat_id, rejected_hpos)
  more_specific_hpo = []
  hpos.map! do |hpo_code|
    hpo_data = hpo_storage[hpo_code]
    if hpo_data.nil?
      STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODE '#{hpo_code}'. Rejected."
      rejected_hpos << hpo_code
      nil # removed by compact! below
    else
      main_hpo_code, name = hpo_data
      childs = hpo_parent_child_relations[main_hpo_code]
      more_specific_hpo << [[main_hpo_code, name], childs.nil? ? [] : childs]
      main_hpo_code
    end
  end
  hpos.compact!
  more_specific_hpo
end
|
80
|
+
|
81
|
+
# Build the binary patient x HPO matrix.
#
# Each row belongs to one patient (in patient_data order) and has one column
# per entry of cohort_hpos: 1 when the patient has that term, 0 otherwise.
# As before, a patient term missing from cohort_hpos raises a TypeError.
#
# patient_data - Hash pat_id => patient record (profile at index HPOS).
# cohort_hpos  - Array with every HPO term of the cohort.
#
# Returns an Array of Arrays of 0/1.
def generate_patient_hpo_matrix(patient_data, cohort_hpos)
  n = cohort_hpos.length
  # One pass to map each term to its first column, instead of scanning
  # cohort_hpos with Array#index for every term of every patient.
  column_of = {}
  cohort_hpos.each_with_index { |hpo, col| column_of[hpo] = col unless column_of.key?(hpo) }
  patient_data.map do |_pat_id, patient_record|
    vector = Array.new(n, 0)
    patient_record[HPOS].each { |hpo| vector[column_of[hpo]] = 1 }
    vector
  end
end
|
94
|
+
|
95
|
+
# Write a matrix as a tab-separated table: a first line with the column
# names, then one line per row starting with its row name.
def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |out|
    out.puts x_names.join("\t")
    matrix.each_with_index do |values, row_number|
      labelled_row = [y_names[row_number], *values]
      out.puts labelled_row.join("\t")
    end
  end
end
|
103
|
+
|
104
|
+
# Get IC and chromosome data for every cluster with more than one patient,
# visiting clusters from the largest to the smallest.
#
# IC values come from load_hpo_ci_values (file taken from ENV['ic_file'] or
# IC_FILE) when options[:ic_stats] is set, otherwise from compute_IC_values.
#
# Returns four values:
#   - per cluster, the list of profile ICs of its patients;
#   - [cluster_id, cluster size, chromosome, patient count] rows (only filled
#     when options[:chromosome_col] is set);
#   - the phenotype profiles of the first
#     options[:clusters2show_detailed_phen_data] processed clusters;
#   - the number of patients in clusters spanning more than one chromosome.
def process_clustered_patients(options, clustered_patients, patient_data)
  phenotype_ic =
    if options[:ic_stats]
      load_hpo_ci_values(ENV['ic_file'] || IC_FILE)
    else
      compute_IC_values(patient_data, $patient_number)
    end
  all_ics = []
  top_cluster_phenotypes = []
  cluster_data_by_chromosomes = []
  multi_chromosome_patients = 0
  processed_clusters = 0
  clusters_by_size = clustered_patients.sort_by { |_cl_id, pat_ids| pat_ids.length }
  clusters_by_size.reverse_each do |cluster_id, patient_ids|
    num_of_patients = patient_ids.length
    next if num_of_patients == 1
    keep_phenotypes = processed_clusters < options[:clusters2show_detailed_phen_data]
    count_chromosomes = !options[:chromosome_col].nil?
    chrs = Hash.new(0)
    all_phens = []
    profile_ics = []
    patient_ids.each do |pat_id|
      patient = patient_data[pat_id]
      phenotypes = patient[HPOS]
      profile_ics << get_profile_ic(phenotypes, phenotype_ic)
      all_phens << phenotypes if keep_phenotypes
      chrs[patient[CHR]] += 1 if count_chromosomes
    end
    top_cluster_phenotypes << all_phens if keep_phenotypes
    all_ics << profile_ics
    if count_chromosomes
      multi_chromosome_patients += num_of_patients if chrs.length > 1
      chrs.each do |chr, count|
        cluster_data_by_chromosomes << [cluster_id, num_of_patients, chr, count]
      end
    end
    processed_clusters += 1
  end
  return all_ics, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
end
|
145
|
+
|
146
|
+
# Average IC of a phenotype profile.
#
# Terms without an entry in phenotype_ic add nothing to the sum but still
# count in the divisor. An empty profile yields 0.0.
#
# hpo_names    - Array of HPO identifiers.
# phenotype_ic - Hash identifier => IC value.
#
# Returns a Float.
def get_profile_ic(hpo_names, phenotype_ic)
  # inject (not sum) keeps the plain left-to-right float addition.
  ic = hpo_names.inject(0) do |total, hpo_id|
    hpo_ic = phenotype_ic[hpo_id]
    hpo_ic.nil? ? total : total + hpo_ic
  end
  profile_length = hpo_names.length
  profile_length = 1 if profile_length == 0 # avoid dividing by zero
  ic.fdiv(profile_length)
end
|
158
|
+
|
159
|
+
# Write one "cluster label <TAB> ic" line per IC value, for at most `limit`
# clusters. The label is "<number of values in the cluster>_<cluster position>".
def write_cluster_ic_data(all_ics, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |out|
    out.puts "cluster_id\tic"
    all_ics.each_with_index do |ics_in_cluster, position|
      break if position == limit
      label = "#{ics_in_cluster.length}_#{position}"
      ics_in_cluster.each { |value| out.puts "#{label}\t#{value}" }
    end
  end
end
|
171
|
+
|
172
|
+
# Write "cluster label <TAB> chromosome <TAB> count" lines for at most
# `limit` clusters. Rows of the same cluster must be consecutive: the cluster
# position advances each time the cluster id changes.
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |out|
    out.puts "cluster_id\tchr\tcount"
    position = 0
    previous_id = cluster_data.empty? ? nil : cluster_data.first.first
    cluster_data.each do |cluster_id, patient_number, chr, count|
      position += 1 unless cluster_id == previous_id
      break if position == limit
      out.puts ["#{patient_number}_#{position}", chr, count].join("\t")
      previous_id = cluster_id
    end
  end
end
|
185
|
+
|
186
|
+
# Dump coverage triplets as "chromosome <TAB> position <TAB> frequency" lines.
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |out|
    coverage_to_plot.each { |chr, position, freq| out.puts "#{chr}\t#{position}\t#{freq}" }
  end
end
|
193
|
+
|
194
|
+
# Collect the HPO profile of every patient, in patient_data order.
#
# The returned Arrays are the very objects stored in the patient records
# (no copies), so changing them in place also changes patient_data.
#
# patient_data - Hash pat_id => patient record (profile at index HPOS).
#
# Returns an Array of profiles.
def get_hpo_profile(patient_data)
  patient_data.map { |_pat_id, pat_data| pat_data[HPOS] }
end
|
201
|
+
|
202
|
+
# Build the first rows of the cohort summary table.
#
# Patients are counted by the part of their identifier before '_i', so
# several records of one patient count once.
#
# patient_data     - Hash pat_id => patient record (only the keys are used).
# cohort_hpos      - Array with every HPO term of the cohort.
# all_hpo_profiles - Array of HPO profiles (Arrays).
#
# Returns an Array of [label, value] pairs.
def get_summary_stats(patient_data, cohort_hpos, all_hpo_profiles)
  stats = [['Unique HPOs', cohort_hpos.length]]
  n_pat = patient_data.keys.map { |pat_id| pat_id.split('_i').first }.uniq.length
  stats << ['Number of patients in the cohort', n_pat]
  all_hpo_prof_lengths = all_hpo_profiles.map { |profile| profile.length }.sort
  total_hpos = all_hpo_prof_lengths.inject(0) { |sum, n| sum + n }
  stats << ['HPOs per patient (average)', total_hpos.fdiv(n_pat).round(4)]
  # Walk the ascending profile lengths until more than 10% of the patients
  # have been passed. The index is clamped to the last element: before, it
  # ran past the end for cohorts of one or two patients and reported nil.
  hpo_pat90 = nil
  last_index = all_hpo_prof_lengths.length - 1
  rate = 0
  count = 0
  while rate <= 0.1
    hpo_pat90 = all_hpo_prof_lengths[[count + 1, last_index].min]
    rate = count.fdiv(n_pat)
    count += 1
  end
  stats << ['HPOs for patient in percentile 90', hpo_pat90]
  stats
end
|
226
|
+
|
227
|
+
# Percentage of profiles containing each HPO term.
#
# all_hpo_profiles - Array of HPO profiles (Arrays of terms).
#
# Returns at most 21 [term, percentage] pairs, sorted by descending
# percentage.
def hpo_stats(all_hpo_profiles)
  counts = Hash.new(0)
  all_hpo_profiles.each { |profile| profile.each { |hpo| counts[hpo] += 1 } }
  n_profiles = all_hpo_profiles.length
  # Named 'percentages' so the local no longer shadows this method's name.
  percentages = counts.map { |hpo, count| [hpo, count.fdiv(n_profiles) * 100] }
  percentages.sort! { |h1, h2| h2[1] <=> h1[1] }
  percentages[0..20]
end
|
242
|
+
|
243
|
+
# Replace, in place, every HPO code of every profile by its name.
#
# Codes missing from hpo_storage are reported on STDERR and left unchanged.
#
# all_hpo_profiles - Array of profiles (Arrays of codes); each is modified.
# hpo_storage      - Hash code => [main_code, name, ...].
#
# Returns all_hpo_profiles.
def translate_hpo_codes2names(all_hpo_profiles, hpo_storage)
  all_hpo_profiles.each do |profile|
    profile.map! do |hpo|
      hpo_data = hpo_storage[hpo]
      if hpo_data.nil?
        STDERR.puts "WARNING: hpo code '#{hpo}' not exists."
        hpo
      else
        hpo_data[1]
      end
    end
  end
end
|
255
|
+
|
256
|
+
# Write, per patient, a CSV table pairing each current phenotype with its
# more specific child phenotypes, and append a summary row to summary_stats.
#
# A patient with fewer than 4 phenotypes gets a warning in its title row.
# A phenotype without childs is written with '-'; otherwise one row per
# child is written and the parent is shown only on the first of them.
#
# suggested_childs                 - Hash pat_id => Array of
#                                    [[parent_code, parent_name], childs],
#                                    each child being [code, name].
# detailed_profile_evaluation_file - path of the CSV file to create.
# summary_stats                    - Array receiving the row with the
#                                    percentage of phenotypes having childs
#                                    (0.0 when there is no phenotype at all,
#                                    where 0/0 used to give NaN).
#
# Returns summary_stats.
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
  hpo_count = 0
  parent_hpo_count = 0
  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
    suggested_childs.each do |pat_id, suggestions|
      warning = suggestions.length < 4 ? 'WARNING: Very few phenotypes' : ''
      csv << ["PATIENT #{pat_id}", warning]
      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
      suggestions.each do |(parent_code, parent_name), childs|
        hpo_count += 1
        parent_field = "#{parent_name} (#{parent_code})"
        if childs.empty?
          csv << [parent_field, '-']
          next
        end
        parent_hpo_count += 1
        childs.each_with_index do |(child_code, child_name), position|
          csv << [position.zero? ? parent_field : "", "#{child_name} (#{child_code})"]
        end
      end
      csv << ["", ""]
    end
  end
  percentage = hpo_count.zero? ? 0.0 : (parent_hpo_count.fdiv(hpo_count) * 100).round(4)
  summary_stats << ['Percentage of defined HPOs that have more specific childs', percentage]
end
|
289
|
+
|
290
|
+
##########################
|
291
|
+
#OPT-PARSER
|
292
|
+
##########################
|
293
|
+
|
294
|
+
# Command line options. Each default is set right before its switch.
# Help text fixes: -f now states the real default (2), -H now describes what
# the flag does (it disables header handling), and "How may" is corrected.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:coverage_analysis] = true
  opts.on("-a", "--coverage_analysis", "Deactivate genome coverage analysis. Default true") do
    options[:coverage_analysis] = false
  end

  options[:bin_size] = 50000
  opts.on("-b", "--bin_size INTEGER", "Maximum number of bins to plot the coverage") do |data|
    options[:bin_size] = data.to_i
  end

  options[:clusters2show_detailed_phen_data] = 3
  opts.on("-C", "--clusters2show INTEGER", "How many patient clusters are show in detailed phenotype cluster data section. Default 3") do |data|
    options[:clusters2show_detailed_phen_data] = data.to_i
  end

  options[:chromosome_col] = nil
  opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
    options[:chromosome_col] = data
  end

  options[:pat_id_col] = nil
  opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
    options[:pat_id_col] = data
  end

  options[:excluded_hpo] = nil
  opts.on("-E", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
    options[:excluded_hpo] = excluded_hpo
  end

  options[:end_col] = nil
  opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
    options[:end_col] = data
  end

  options[:patients_filter] = 2
  opts.on("-f", "--patients_filter INTEGER", "Minimum number of patients sharing SORs. Default 2") do |data|
    options[:patients_filter] = data.to_i
  end

  options[:clusters2graph] = 30
  opts.on("-g", "--clusters2graph INTEGER", "How many patient clusters are plotted in cluster plots. Default 30") do |data|
    options[:clusters2graph] = data.to_i
  end

  options[:header] = true
  #chr\tstart\tstop
  opts.on("-H", "--header", "Set if the file has no header line. By default a header line is expected") do
    options[:header] = false
  end

  options[:input_file] = nil
  opts.on("-i", "--input_file PATH", "Input file with patient data") do |data|
    options[:input_file] = data
  end

  options[:hpo_names] = false
  opts.on("-n", "--hpo_names", "Define if the input HPO are human readable names. Default false") do
    options[:hpo_names] = true
  end

  options[:output_file] = nil
  opts.on("-o", "--output_file PATH", "Output file with patient data") do |data|
    options[:output_file] = data
  end

  options[:hpo_file] = nil
  opts.on("-P", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value|
    options[:hpo_file] = value
  end

  options[:hpo_col] = nil
  opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
    options[:hpo_col] = data
  end

  options[:hpo_separator] = '|'
  opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data|
    options[:hpo_separator] = data
  end

  options[:start_col] = nil
  opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
    options[:start_col] = data
  end

  options[:ic_stats] = false
  opts.on("-t", "--ic_stats", "Use internal IC stats. Default false") do
    options[:ic_stats] = true
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end

end.parse!
|
395
|
+
|
396
|
+
|
397
|
+
##########################
|
398
|
+
#MAIN
|
399
|
+
##########################
|
400
|
+
# The report and every intermediate file are written next to the requested
# output file; tables and plots go into a 'temp' subfolder.
abort 'ERROR: an output file must be given with -o/--output_file.' if options[:output_file].nil?
output_folder = File.dirname(options[:output_file])
detailed_profile_evaluation_file = File.join(output_folder, 'detailed_hpo_profile_evaluation.csv')
temp_folder = File.join(output_folder, 'temp')
matrix_file = File.join(temp_folder, 'pat_hpo_matrix.txt')
clustered_patients_file = File.join(temp_folder, 'cluster_asignation')
cluster_ic_data_file = File.join(temp_folder, 'cluster_ic_data.txt')
cluster_chromosome_data_file = File.join(temp_folder, 'cluster_chromosome_data.txt')
coverage_to_plot_file = File.join(temp_folder, 'coverage_data.txt')
sor_coverage_to_plot_file = File.join(temp_folder, 'sor_coverage_data.txt')
# cnvs_lenght_to_plot_file = File.join(temp_folder, 'cnvs_lenght.txt')
# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
Dir.mkdir(temp_folder) unless File.exist?(temp_folder)

# LOAD HPO DATA
#-------------------------

# #load hpo dictionaries
hpo_black_list = []
hpo_black_list = load_hpo_black_list(options[:excluded_hpo]) unless options[:excluded_hpo].nil?
hpo_file = ENV['hpo_file']
hpo_file = HPO_FILE if hpo_file.nil?
hpo_storage = load_hpo_file(hpo_file, hpo_black_list)
hpo_parent_child_relations = get_child_parent_relations(hpo_storage)
name2code_dictionary = create_hpo_dictionary(hpo_storage) if options[:hpo_names]

patient_data, $patient_number = load_patient_cohort(options)
cohort_hpos, suggested_childs, rejected_hpos = format_patient_data(patient_data, options, name2code_dictionary, hpo_storage, hpo_parent_child_relations)
pat_hpo_matrix = generate_patient_hpo_matrix(patient_data, cohort_hpos)
write_matrix_for_R(pat_hpo_matrix, cohort_hpos, patient_data.keys, matrix_file)

# External commands get their arguments as a list, so no shell is involved
# and paths with spaces or shell metacharacters are passed through intact.
# An existing cluster assignation file is reused instead of re-clustering.
system('get_clusters.R', matrix_file, temp_folder) unless File.exist?(clustered_patients_file)
clustered_patients = load_clustered_patients(clustered_patients_file)
all_ics, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients = process_clustered_patients(options, clustered_patients, patient_data)
write_cluster_ic_data(all_ics, cluster_ic_data_file, options[:clusters2graph])
system('plot_boxplot.R', cluster_ic_data_file, temp_folder, 'cluster_id', 'ic', 'Cluster size/id', 'Information coefficient')
all_hpo_profiles = get_hpo_profile(patient_data)
# NOTE: this rewrites the profiles in place, so from here on patient_data
# holds HPO names instead of codes.
translate_hpo_codes2names(all_hpo_profiles, hpo_storage)
summary_stats = get_summary_stats(patient_data, cohort_hpos, all_hpo_profiles)
write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
hpo_stats = hpo_stats(all_hpo_profiles)
summary_stats << ['Number of unknown phenotypes', rejected_hpos.length]

all_cnvs_length = []
all_sor_length = nil # stays nil unless the coverage analysis below runs
if !options[:chromosome_col].nil?
  summary_stats << ['Number of clusters with mutations accross > 1 chromosomes', multi_chromosome_patients]
  write_cluster_chromosome_data(cluster_data_by_chromosomes, cluster_chromosome_data_file, options[:clusters2graph])
  system('plot_scatterplot.R', cluster_chromosome_data_file, temp_folder, 'cluster_id', 'chr', 'count', 'Cluster size/id', 'Chromosome', 'Patients')

  #----------------------------------
  #Prepare data to plot coverage
  if options[:coverage_analysis]
    processed_patient_data = process_patient_data(patient_data)
    cnv_sizes = []
    processed_patient_data.each do |chr, metadata|
      metadata.each do |patientID, start, stop|
        cnv_sizes << stop - start
      end
    end
    # An empty list used to fail with NoMethodError (nil.fdiv).
    cnv_size_average = cnv_sizes.empty? ? 0.0 : cnv_sizes.inject { |sum, el| sum + el }.fdiv(cnv_sizes.length.to_f)
    patients_by_cluster, sors = generate_cluster_regions(processed_patient_data, 'A', 0)
    total_patients_sharing_sors = patients_by_cluster.keys.map { |identifier| identifier.split('_i').first }
    all_cnvs_length = get_cnvs_length(patient_data)

    ###1. Process CNVs
    raw_coverage, n_cnv, nt, pats_per_region = calculate_coverage(sors)
    summary_stats << ['Number of genome windows', n_cnv]
    summary_stats << ['Nucleotides affected by mutations', nt]
    summary_stats << ['Patient average per region', pats_per_region.round(4)]
    summary_stats << ['CNV size average', cnv_size_average.round(4)]
    coverage_to_plot = get_final_coverage(raw_coverage, options[:bin_size])
    write_coverage_data(coverage_to_plot, coverage_to_plot_file)
    system('plot_area.R', '-d', coverage_to_plot_file, '-o', "#{temp_folder}/coverage_plot", '-x', 'V2', '-y', 'V3', '-f', 'V1', '-H', '-m', CHR_SIZE, '-t', 'CNV')

    ###2. Process SORs
    raw_sor_coverage, n_sor, nt, pats_per_region = calculate_coverage(sors, options[:patients_filter] - 1)
    summary_stats << ["Number of patients with at least 1 SOR", total_patients_sharing_sors.uniq.length]
    summary_stats << ["Number of SORs with >= #{options[:patients_filter]} patients", n_sor]
    summary_stats << ['Nucleotides affected by mutations', nt]
    # summary_stats << ['Patient average per region', pats_per_region]
    sor_coverage_to_plot = get_final_coverage(raw_sor_coverage, options[:bin_size])
    write_coverage_data(sor_coverage_to_plot, sor_coverage_to_plot_file)
    system('plot_area.R', '-d', sor_coverage_to_plot_file, '-o', "#{temp_folder}/sor_coverage_plot", '-x', 'V2', '-y', 'V3', '-f', 'V1', '-H', '-m', CHR_SIZE, '-t', 'SOR')
    all_sor_length = get_sor_length_distribution(raw_sor_coverage)
  end
end
#----------------------------------
#Report
# Per detailed cluster: [number of patients, phenotypes, percentage of the
# cluster's patients showing each phenotype].
new_cluster_phenotypes = {}
top_cluster_phenotypes.each_with_index do |cluster, clusterID|
  total_patients = cluster.length
  phenotypes_frequency = Hash.new(0)
  cluster.each do |phenotypes|
    phenotypes.each { |p| phenotypes_frequency[p] += 1 }
  end
  new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map { |v| v.fdiv(total_patients) * 100 }]
end

container = {
  :temp_folder => temp_folder,
  # :top_cluster_phenotypes => top_cluster_phenotypes.length,
  :summary_stats => summary_stats,
  :hpo_stats => hpo_stats,
  :all_cnvs_length => all_cnvs_length,
  :all_sor_length => all_sor_length,
  :new_cluster_phenotypes => new_cluster_phenotypes.keys.length
}

# One single-row table per detailed cluster, stored under "clust_<id>".
new_cluster_phenotypes.each do |clusterID, info|
  phens = info[1].join(', ')
  freqs = info[2].map { |a| a.round(4) }.join(', ')
  container["clust_#{clusterID}"] = [[info[0], phens, freqs]]
end

# File.read closes the template file; File.open(...).read left it open.
template = File.read(File.join(REPORT_FOLDER, 'cohort_report.erb'))
report = Report_html.new(container, 'Cohort quality report')
report.build(template)
report.write(options[:output_file]+'.html')
|