pets 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +6 -0
- data/bin/area_under_curve_pr.rb +118 -0
- data/bin/association_metrics_average.rb +94 -0
- data/bin/coPatReporter.rb +531 -0
- data/bin/console +14 -0
- data/bin/fmeasure_index.rb +72 -0
- data/bin/get_PR_values.rb +90 -0
- data/bin/get_clusters.R +18 -0
- data/bin/get_network_nodes.rb +197 -0
- data/bin/lines.R +77 -0
- data/bin/merge_by_cluster.rb +62 -0
- data/bin/merge_pairs.rb +138 -0
- data/bin/paco_translator.rb +102 -0
- data/bin/phen2reg.rb +385 -0
- data/bin/phen2reg_predictor_check.rb +297 -0
- data/bin/plot_area.R +71 -0
- data/bin/plot_boxplot.R +21 -0
- data/bin/plot_density.R +46 -0
- data/bin/plot_scatterplot.R +25 -0
- data/bin/reg2phen.rb +116 -0
- data/bin/region_to_patients_generator.rb +84 -0
- data/bin/relate_CI_to_association_value.rb +90 -0
- data/bin/setup +8 -0
- data/bin/standardize_scores.R +40 -0
- data/bin/xyplot_graph.R +60 -0
- data/external_data/biosystems_gene.gz +0 -0
- data/external_data/bsid2info.gz +0 -0
- data/external_data/chromosome_sizes_hg19.txt +24 -0
- data/external_data/gene_data.gz +0 -0
- data/external_data/gene_data_with_pathways.gz +0 -0
- data/external_data/gene_location.gz +0 -0
- data/external_data/hp.obo +146363 -0
- data/external_data/remove +0 -0
- data/lib/pets.rb +6 -0
- data/lib/pets/coPatReporterMethods.rb +77 -0
- data/lib/pets/generalMethods.rb +556 -0
- data/lib/pets/phen2reg_methods.rb +432 -0
- data/lib/pets/version.rb +3 -0
- data/pets.gemspec +47 -0
- data/templates/cohort_report.erb +93 -0
- data/templates/patient_report.erb +209 -0
- metadata +183 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a629423c3668446b726dc91d984efe15e730ed48
|
4
|
+
data.tar.gz: 11b466ef84cdf9d84354fb54f7c5371fee6fe067
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c361cd825328b9265851eef94ae704e2bfc4c88c215b193ee10edb04e2caa93d538c66c7701d09b2ae12c018921f8d579a133705a5ab16e342e5cf961abbd370
|
7
|
+
data.tar.gz: 8acfa703181a2787f8e50c36b0dd2eb8704b7f3f10f6b45e2a7777e3a7c144182c94ffad57a919efb51c6c22817fff436c1b549f6eee5e7fc5ea07853f6a5512
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2019 elenarojano
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# Gephepred
|
2
|
+
|
3
|
+
Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/gephepred`. To experiment with that code, run `bin/console` for an interactive prompt.
|
4
|
+
|
5
|
+
TODO: Delete this and the text above, and describe your gem
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'gephepred'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install gephepred
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
TODO: Write usage instructions here
|
26
|
+
|
27
|
+
## Development
|
28
|
+
|
29
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
30
|
+
|
31
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
32
|
+
|
33
|
+
## Contributing
|
34
|
+
|
35
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/gephepred.
|
36
|
+
|
37
|
+
|
38
|
+
## License
|
39
|
+
|
40
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
41
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
#Tool for calculating the area under the curve (AUC) of precision-recall (PR) curves.
|
4
|
+
|
5
|
+
##########################
|
6
|
+
#LIBRARIES
|
7
|
+
##########################
|
8
|
+
|
9
|
+
require 'optparse'
|
10
|
+
|
11
|
+
##########################
|
12
|
+
#METHODS
|
13
|
+
##########################
|
14
|
+
|
15
|
+
# Reads a tab-separated file and returns its (x, y) points sorted by x.
#
# input_file - path of the tab-separated file to read.
# x_val_col  - 1-based column number holding the x values.
# y_val_col  - 1-based column number holding the y values.
#
# Any line containing the text 'prec' or 'rec' is treated as a header and
# skipped. Values are converted with to_f.
#
# Returns an Array of [x, y] Float pairs ordered by ascending x.
def load_file(input_file, x_val_col, y_val_col)
  coordinates = []
  # File.foreach closes the file once the iteration finishes, so no handle is leaked.
  File.foreach(input_file) do |line|
    line.chomp!
    next if line.include?('prec') || line.include?('rec')
    fields = line.split("\t")
    coordinates << [fields[x_val_col - 1].to_f, fields[y_val_col - 1].to_f]
  end
  return coordinates.sort { |r1, r2| r1[0] <=> r2[0] }
end
|
28
|
+
|
29
|
+
|
30
|
+
# Computes the area under a curve given as points sorted by x.
#
# pr_values - Array of [x, y] pairs, e.g. [[x, y], [x', y'], ...].
#
# The first point contributes the rectangle x * y (the curve is extended at
# constant height from x = 0). Every following segment contributes a
# trapezoid: a rectangle as tall as the LOWER of its two end points plus the
# triangle that spans the height difference. Using the lower end point keeps
# the result correct for rising segments as well as falling ones.
#
# Returns the accumulated area (0 for an empty list).
def calculate_auc(pr_values)
  return 0 if pr_values.empty?
  first_x, first_y = pr_values.first
  total_area = 0
  total_area += first_x * first_y
  pr_values.each_cons(2) do |(prev_x, prev_y), (current_x, current_y)|
    width = (prev_x - current_x).abs
    # Rectangle below the segment.
    total_area += width * [prev_y, current_y].min
    # Triangle between the rectangle and the segment.
    total_area += width * (prev_y - current_y).abs / 2
  end
  return total_area
end
|
56
|
+
|
57
|
+
# def calculate_auc(pr_values)
|
58
|
+
# #pr_values = [[x, y], [x', y']...]
|
59
|
+
# counter = 0
|
60
|
+
# x_val = 0
|
61
|
+
# y_val = 0
|
62
|
+
# total_square_area = 0
|
63
|
+
# total_triangle_area = 0
|
64
|
+
# pr_values.each do |xy_pair|
|
65
|
+
# if counter != 0
|
66
|
+
# x_prime = xy_pair[0]
|
67
|
+
# y_prime = xy_pair[1]
|
68
|
+
# #puts "#{x_prime}\t#{x_val}"
|
69
|
+
# #puts "#{y_prime}\t#{y_val}"
|
70
|
+
# total_square_area += (x_val - x_prime) * y_prime
|
71
|
+
# total_triangle_area += (x_val - x_prime) * (y_prime - y_val) / 2
|
72
|
+
# x_val = x_prime
|
73
|
+
# y_val = y_prime
|
74
|
+
# else
|
75
|
+
# x_val = xy_pair[0]
|
76
|
+
# y_val = xy_pair[1]
|
77
|
+
# counter += 1
|
78
|
+
# end
|
79
|
+
# end
|
80
|
+
# total_area = total_square_area + total_triangle_area
|
81
|
+
# STDERR.puts total_area
|
82
|
+
# return total_area
|
83
|
+
# end
|
84
|
+
|
85
|
+
|
86
|
+
##########################
|
87
|
+
#OPT-PARSE
|
88
|
+
##########################
|
89
|
+
|
90
|
+
options = {}
|
91
|
+
OptionParser.new do |opts|
|
92
|
+
opts.banner = "Usage: #{__FILE__} [options]"
|
93
|
+
|
94
|
+
options[:input_file] = nil
|
95
|
+
opts.on("-f", "--input_file PATH", "Precision-recall values file") do |input_file|
|
96
|
+
options[:input_file] = input_file
|
97
|
+
end
|
98
|
+
|
99
|
+
options[:x_values] = nil
|
100
|
+
opts.on("-x", "--:x_values INTEGER", "Set column for extracting x values") do |x_values|
|
101
|
+
options[:x_values] = x_values.to_i
|
102
|
+
end
|
103
|
+
|
104
|
+
options[:y_values] = nil
|
105
|
+
opts.on("-y", "--:y_values INTEGER", "Set column for extracting y values") do |y_values|
|
106
|
+
options[:y_values] = y_values.to_i
|
107
|
+
end
|
108
|
+
|
109
|
+
end.parse!
|
110
|
+
|
111
|
+
##########################
|
112
|
+
#MAIN
|
113
|
+
##########################
|
114
|
+
|
115
|
+
pr_values = load_file(options[:input_file], options[:x_values], options[:y_values])
|
116
|
+
#puts pr_values
|
117
|
+
final_area = calculate_auc(pr_values)
|
118
|
+
puts final_area
|
@@ -0,0 +1,94 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#Tool for calculating averages across several association value files.
|
3
|
+
#File structure: prec rec cut meth
|
4
|
+
#Load all files (7) stored in the same directory and calculate average;
|
5
|
+
#of lines for each method. Return a file with the same structure;
|
6
|
+
#giving name as "average" to the last column
|
7
|
+
|
8
|
+
require 'optparse'
|
9
|
+
|
10
|
+
##########################
|
11
|
+
#METHODS
|
12
|
+
##########################
|
13
|
+
|
14
|
+
# Loads a tab-separated association file that has a one-line header.
#
# filename - path of the file to read.
#
# The first line is kept verbatim as the header. Every other line is split on
# tabs into four fields; the first three are converted with to_f and the
# fourth is kept as a String.
#
# Returns two values: the Array of [Float, Float, Float, String] rows and the
# header String ('' when the file is empty).
def load_association_file(filename)
  fileInfo = []
  header = ''
  # File.foreach closes the file once the iteration finishes, so no handle is leaked.
  File.foreach(filename).with_index do |line, line_number|
    line.chomp!
    if line_number.zero?
      header = line
    else
      cut, precision, recall, meth = line.split("\t")
      fileInfo << [cut.to_f, precision.to_f, recall.to_f, meth]
    end
  end
  return fileInfo, header
end
|
30
|
+
|
31
|
+
# Averages the selected columns row by row across several loaded files.
#
# all_files        - Array of files, each one an Array of rows (Arrays).
# cols_for_average - Array of 0-based column indexes to average.
#
# Rows are matched by position, using the first file as the reference. The
# columns that are not averaged are copied from the first file. Neither
# all_files nor any of its rows is modified.
#
# Returns a new Array of rows, or [] when all_files is empty.
def calculate_average(all_files, cols_for_average)
  return [] if all_files.empty?
  n_files = all_files.length.to_f
  ref_file, *other_files = all_files
  average = []
  ref_file.each_with_index do |line, i|
    # Work on a copy so the caller's rows keep their original values.
    averaged_line = line.dup
    cols_for_average.each do |col|
      total = other_files.inject(line[col]) { |sum, file| sum + file[i][col] }
      averaged_line[col] = total / n_files
    end
    average << averaged_line
  end
  return average
end
|
53
|
+
|
54
|
+
|
55
|
+
##########################
|
56
|
+
#OPT-PARSER
|
57
|
+
##########################
|
58
|
+
|
59
|
+
options = {}
|
60
|
+
OptionParser.new do |opts|
|
61
|
+
opts.banner = "Usage: #{__FILE__} [options]"
|
62
|
+
|
63
|
+
options[:file_names] = nil
|
64
|
+
opts.on("-f", "--file_names STRING", "Input file names to calculate averages. Please separate names by commas") do |file_names|
|
65
|
+
options[:file_names] = file_names.split(',')
|
66
|
+
end
|
67
|
+
|
68
|
+
options[:which_cols] = nil
|
69
|
+
opts.on("-c", "--which_cols STRING", "Cols for performing average analysis") do |which_cols|
|
70
|
+
options[:which_cols] = which_cols.split(',').map{|i| i.to_i - 1}
|
71
|
+
end
|
72
|
+
|
73
|
+
end.parse!
|
74
|
+
|
75
|
+
##########################
|
76
|
+
#MAIN
|
77
|
+
##########################
|
78
|
+
|
79
|
+
all_files = []
|
80
|
+
header = nil
|
81
|
+
options[:file_names].each do |filename|
|
82
|
+
file, header = load_association_file(filename)
|
83
|
+
all_files << file
|
84
|
+
end
|
85
|
+
|
86
|
+
average = calculate_average(all_files, options[:which_cols])
|
87
|
+
|
88
|
+
puts header
|
89
|
+
average.each do |line|
|
90
|
+
puts line.join("\t")
|
91
|
+
end
|
92
|
+
|
93
|
+
|
94
|
+
|
@@ -0,0 +1,531 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
ROOT_PATH = File.dirname(__FILE__)
|
4
|
+
REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
|
5
|
+
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
|
6
|
+
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
|
7
|
+
IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
|
8
|
+
CHR_SIZE = File.join(EXTERNAL_DATA, 'chromosome_sizes_hg19.txt')
|
9
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'gephepred'))
|
10
|
+
|
11
|
+
require 'optparse'
|
12
|
+
require 'csv'
|
13
|
+
require 'generalMethods.rb'
|
14
|
+
require 'coPatReporterMethods.rb'
|
15
|
+
require 'report_html'
|
16
|
+
|
17
|
+
##########################
|
18
|
+
#METHODS
|
19
|
+
##########################
|
20
|
+
HPOS = 0
|
21
|
+
CHR = 1
|
22
|
+
START = 2
|
23
|
+
STOP = 3
|
24
|
+
|
25
|
+
# Normalises every patient record in place and gathers cohort-wide HPO data.
#
# patient_data               - Hash of patient id => record; the record is
#                              destructured as [hpo string, chr, start, stop].
# options                    - Hash; :hpo_separator and :hpo_names are read.
# name2code_dictionary       - lookup passed to translate_hpo_names2codes.
# hpo_storage                - lookup passed to check_hpo_codes.
# hpo_parent_child_relations - lookup passed to check_hpo_codes.
#
# For each record the HPO string is split into a list, translated from names
# when options[:hpo_names] is set, validated, and stored back at HPOS. Start
# and stop are converted with to_i when they are not nil.
#
# Returns three values: the unique HPO codes of the cohort, a Hash of patient
# id => result of check_hpo_codes, and the unique rejected HPOs.
def format_patient_data(patient_data, options, name2code_dictionary, hpo_storage, hpo_parent_child_relations)
  cohort_codes = []
  discarded = []
  child_suggestions = {}
  patient_data.each do |pat_id, patient_record|
    profile_string, _chr, first_pos, last_pos = patient_record
    profile = profile_string.split(options[:hpo_separator])
    if options[:hpo_names]
      translate_hpo_names2codes(profile, name2code_dictionary, pat_id, discarded)
    end
    child_suggestions[pat_id] = check_hpo_codes(profile, hpo_storage, hpo_parent_child_relations, pat_id, discarded)
    cohort_codes.concat(profile)
    patient_record[HPOS] = profile
    patient_record[START] = first_pos.to_i unless first_pos.nil?
    patient_record[STOP] = last_pos.to_i unless last_pos.nil?
  end
  return cohort_codes.uniq, child_suggestions, discarded.uniq
end
|
41
|
+
|
42
|
+
# Replaces, in place, the HPO names of a patient with their codes.
#
# hpos           - Array of HPO names; its content is replaced by the codes found.
# hpo_dictionary - lookup of HPO name => HPO code.
# pat_id         - patient identifier, only used in the warning message.
# rejected_hpos  - Array that receives every name missing from the dictionary.
#
# Names without a code are reported on STDERR and dropped from hpos.
#
# Returns hpos.
def translate_hpo_names2codes(hpos, hpo_dictionary, pat_id, rejected_hpos)
  translated = hpos.each_with_object([]) do |hpo_name, codes|
    code = hpo_dictionary[hpo_name]
    unless code.nil?
      codes << code
      next
    end
    STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAME '#{hpo_name}'. Rejected."
    rejected_hpos << hpo_name
  end
  hpos.replace(translated)
end
|
56
|
+
|
57
|
+
# Validates the HPO codes of a patient and collects their child terms.
#
# hpos                       - Array of HPO codes, updated in place: each code
#                              is replaced by the first element of its
#                              hpo_storage entry and unknown codes are removed.
# hpo_storage                - lookup of HPO code => [main code, name].
# hpo_parent_child_relations - lookup of main code => child terms.
# pat_id                     - patient identifier, only used in the warning message.
# rejected_hpos              - Array that receives every unknown code.
#
# Unknown codes are reported on STDERR.
#
# Returns an Array of [[main code, name], childs] entries, one per known
# code; childs is [] when the code has no entry in hpo_parent_child_relations.
def check_hpo_codes(hpos, hpo_storage, hpo_parent_child_relations, pat_id, rejected_hpos)
  suggestions = []
  hpos.map! do |hpo_code|
    record = hpo_storage[hpo_code]
    if record.nil?
      STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODE '#{hpo_code}'. Rejected."
      rejected_hpos << hpo_code
      nil # marked for removal by compact! below
    else
      main_code, term_name = record
      children = hpo_parent_child_relations[main_code]
      suggestions << [[main_code, term_name], children.nil? ? [] : children]
      main_code # alternate codes are swapped for the main one
    end
  end
  hpos.compact!
  return suggestions
end
|
80
|
+
|
81
|
+
# Builds the binary patient x HPO matrix of the cohort.
#
# patient_data - Hash of patient id => record; the HPO list is read at HPOS.
# cohort_hpos  - Array of HPO codes; its order defines the matrix columns.
#
# Returns an Array with one row per patient (in patient_data order); each row
# is an Array of 0/1 flags, 1 where the patient has the HPO of that column.
# Raises TypeError when a patient has an HPO that is absent from cohort_hpos.
def generate_patient_hpo_matrix(patient_data, cohort_hpos)
  # Resolve every column once instead of scanning cohort_hpos for each phenotype.
  # The first occurrence wins, matching what Array#index would return.
  column_of = {}
  cohort_hpos.each_with_index do |hpo, column|
    column_of[hpo] = column unless column_of.key?(hpo)
  end
  n = cohort_hpos.length
  matrix = []
  patient_data.each do |pat_id, patient_record|
    vector = Array.new(n, 0)
    patient_record[HPOS].each do |hpo|
      vector[column_of[hpo]] = 1
    end
    matrix << vector
  end
  return matrix
end
|
94
|
+
|
95
|
+
# Writes a matrix as a tab-separated table.
#
# matrix  - Array of rows (Arrays).
# x_names - Array of column names, written as the first line.
# y_names - Array of row names; y_names[i] is written in front of row i.
# file    - path of the file to create or overwrite.
#
# The header line holds only the column names, so it has one field fewer
# than the data lines.
def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |out|
    out.puts(x_names.join("\t"))
    matrix.each_with_index do |row, row_index|
      labelled_row = [y_names[row_index]] + row
      out.puts(labelled_row.join("\t"))
    end
  end
end
|
103
|
+
|
104
|
+
# Gathers information-content (IC) and chromosome data for every patient cluster.
#
# options            - Hash; :ic_stats, :clusters2show_detailed_phen_data and
#                      :chromosome_col are read.
# clustered_patients - collection of cluster id => Array of patient ids.
# patient_data       - Hash of patient id => record (read at HPOS and CHR).
#
# IC values come from load_hpo_ci_values when options[:ic_stats] is set (the
# 'ic_file' environment variable overrides IC_FILE); otherwise they are
# computed with compute_IC_values. Clusters are visited from largest to
# smallest and clusters with exactly one patient are skipped.
#
# Returns four values:
#   all_ics                     - one Array of profile ICs per processed cluster.
#   cluster_data_by_chromosomes - [cluster id, cluster size, chr, count] rows
#                                 (only filled when :chromosome_col is set).
#   top_cluster_phenotypes      - phenotype profiles of the first
#                                 :clusters2show_detailed_phen_data clusters.
#   multi_chromosome_patients   - sum of the sizes of the clusters that span
#                                 more than one chromosome.
def process_clustered_patients(options, clustered_patients, patient_data) # get ic and chromosomes
  phenotype_ic =
    if options[:ic_stats]
      load_hpo_ci_values(ENV['ic_file'] || IC_FILE)
    else
      compute_IC_values(patient_data, $patient_number)
    end
  detail_limit = options[:clusters2show_detailed_phen_data]
  track_chromosomes = !options[:chromosome_col].nil?
  all_ics = []
  top_cluster_phenotypes = []
  cluster_data_by_chromosomes = []
  multi_chromosome_patients = 0
  processed_clusters = 0
  ranked_clusters = clustered_patients.sort_by { |_cl_id, pat_ids| pat_ids.length }.reverse
  ranked_clusters.each do |cluster_id, patient_ids|
    num_of_patients = patient_ids.length
    next if num_of_patients == 1
    # Detailed phenotype data is only kept for the first clusters processed.
    keep_details = processed_clusters < detail_limit
    chromosome_counts = Hash.new(0)
    cluster_profiles = []
    cluster_ics = patient_ids.map do |pat_id|
      patient = patient_data[pat_id]
      phenotypes = patient[HPOS]
      cluster_profiles << phenotypes if keep_details
      chromosome_counts[patient[CHR]] += 1 if track_chromosomes
      get_profile_ic(phenotypes, phenotype_ic)
    end
    top_cluster_phenotypes << cluster_profiles if keep_details
    all_ics << cluster_ics
    if track_chromosomes
      multi_chromosome_patients += num_of_patients if chromosome_counts.length > 1
      chromosome_counts.each do |chr, count|
        cluster_data_by_chromosomes << [cluster_id, num_of_patients, chr, count]
      end
    end
    processed_clusters += 1
  end
  return all_ics, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
end
|
145
|
+
|
146
|
+
# Computes the mean information content (IC) of an HPO profile.
#
# hpo_names    - Array of HPO identifiers.
# phenotype_ic - lookup of HPO identifier => IC value.
#
# Identifiers without an IC value add nothing to the sum but still count
# towards the profile length used as divisor. An empty profile is divided by 1.
#
# Returns the mean as a Float.
def get_profile_ic(hpo_names, phenotype_ic)
  known_ics = hpo_names.map { |hpo_id| phenotype_ic[hpo_id] }.reject(&:nil?)
  ic_sum = known_ics.inject(0) { |sum, hpo_ic| sum + hpo_ic }
  divisor = hpo_names.length.zero? ? 1 : hpo_names.length
  ic_sum.fdiv(divisor)
end
|
158
|
+
|
159
|
+
# Writes the IC values of the first clusters as a two-column table.
#
# all_ics              - Array with one Array of IC values per cluster.
# cluster_ic_data_file - path of the file to create or overwrite.
# limit                - writing stops when the cluster position equals this number.
#
# Each value becomes a line "<cluster size>_<cluster position>\t<ic>" below
# the header "cluster_id\tic".
def write_cluster_ic_data(all_ics, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |out|
    out.puts %w[cluster_id ic].join("\t")
    all_ics.each_with_index do |cluster_ics, position|
      break if position == limit
      label = "#{cluster_ics.length}_#{position}"
      cluster_ics.each { |clust_ic| out.puts "#{label}\t#{clust_ic}" }
    end
  end
end
|
171
|
+
|
172
|
+
# Writes the per-chromosome patient counts of the first clusters.
#
# cluster_data                 - Array of [cluster id, patient number, chr, count]
#                                rows; rows of the same cluster are expected
#                                to be consecutive.
# cluster_chromosome_data_file - path of the file to create or overwrite.
# limit                        - writing stops when the cluster position
#                                equals this number.
#
# Each row becomes "<patient number>_<cluster position>\t<chr>\t<count>"
# below the header "cluster_id\tchr\tcount". The cluster position advances
# every time the cluster id differs from the one of the previous row.
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |out|
    out.puts %w[cluster_id chr count].join("\t")
    position = 0
    previous_id = cluster_data.empty? ? nil : cluster_data.first.first
    cluster_data.each do |cluster_id, patient_number, chr, count|
      position += 1 if cluster_id != previous_id
      break if position == limit
      fields = ["#{patient_number}_#{position}", chr, count]
      out.puts fields.join("\t")
      previous_id = cluster_id
    end
  end
end
|
185
|
+
|
186
|
+
# Writes coverage records as tab-separated lines without a header.
#
# coverage_to_plot      - collection of [chr, position, freq] records.
# coverage_to_plot_file - path of the file to create or overwrite.
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |out|
    coverage_to_plot.each do |chr, position, freq|
      out.puts format("%s\t%s\t%s", chr, position, freq)
    end
  end
end
|
193
|
+
|
194
|
+
# Collects the HPO profile of every patient.
#
# patient_data - Hash of patient id => record; the profile is read at HPOS.
#
# Returns an Array with the profiles in patient_data order. The profiles are
# the same objects stored in the records, not copies.
def get_hpo_profile(patient_data)
  patient_data.map { |_pat_id, pat_data| pat_data[HPOS] }
end
|
201
|
+
|
202
|
+
# Builds the first rows of the cohort summary table.
#
# patient_data     - Hash keyed by patient id; the text before the first '_i'
#                    of each key identifies the patient.
# cohort_hpos      - Array of the unique HPOs of the cohort.
# all_hpo_profiles - Array of HPO profiles (Arrays).
#
# Returns an Array of [label, value] rows: number of unique HPOs, number of
# distinct patients, mean profile length (total length divided by the number
# of distinct patients, rounded to 4 decimals) and the profile length picked
# by the percentile scan described below.
def get_summary_stats(patient_data, cohort_hpos, all_hpo_profiles)
  n_pat = patient_data.keys.map { |pat_id| pat_id.split('_i').first }.uniq.length
  sorted_lengths = all_hpo_profiles.map { |profile| profile.length }.sort
  total_length = sorted_lengths.inject(0) { |sum, n| sum + n }
  mean_length = total_length.fdiv(n_pat).round(4)

  # Walk the ascending profile lengths until the fraction of patients already
  # visited exceeds 0.1. The value kept is the one at position + 1, which is
  # nil when it falls outside the list.
  # NOTE(review): the + 1 offset looks like an off-by-one - confirm the intent.
  hpo_pat90 = nil
  position = 0
  loop do
    hpo_pat90 = sorted_lengths[position + 1]
    rate = position.fdiv(n_pat)
    position += 1
    break unless rate <= 0.1
  end

  [
    ['Unique HPOs', cohort_hpos.length],
    ['Number of patients in the cohort', n_pat],
    ['HPOs per patient (average)', mean_length],
    ['HPOs for patient in percentile 90', hpo_pat90]
  ]
end
|
226
|
+
|
227
|
+
# Ranks the HPOs of the cohort by how many profiles contain them.
#
# all_hpo_profiles - Array of HPO profiles (Arrays).
#
# Every occurrence of an HPO within a profile is counted. The count is
# expressed as a percentage of the number of profiles.
#
# Returns at most 21 [hpo, percentage] pairs, highest percentage first.
def hpo_stats(all_hpo_profiles)
  occurrences = all_hpo_profiles.each_with_object(Hash.new(0)) do |profile, counts|
    profile.each { |hpo| counts[hpo] += 1 }
  end
  n_profiles = all_hpo_profiles.length
  percentages = occurrences.map { |hpo, count| [hpo, count.fdiv(n_profiles) * 100] }
  percentages.sort { |h1, h2| h2[1] <=> h1[1] }.first(21)
end
|
242
|
+
|
243
|
+
# Replaces, in place, the HPO codes of every profile with their names.
#
# all_hpo_profiles - Array of HPO profiles (Arrays of codes), modified in place.
# hpo_storage      - lookup of HPO code => record; the name is read at index 1.
#
# Codes missing from hpo_storage are reported on STDERR and left unchanged.
#
# Returns all_hpo_profiles.
def translate_hpo_codes2names(all_hpo_profiles, hpo_storage)
  all_hpo_profiles.each do |profile|
    profile.map! do |hpo|
      record = hpo_storage[hpo]
      next record[1] unless record.nil?
      STDERR.puts "WARNING: hpo code '#{hpo}' not exists."
      hpo
    end
  end
end
|
255
|
+
|
256
|
+
# Writes, per patient, the current phenotypes next to their more specific child terms.
#
# suggested_childs                 - Hash of patient id => Array of
#                                    [[parent code, parent name], childs],
#                                    childs being [code, name] pairs.
# detailed_profile_evaluation_file - path of the CSV file to create or overwrite.
# summary_stats                    - Array that receives one extra
#                                    [label, percentage] row.
#
# Each patient section starts with a title row (carrying a warning when the
# patient has fewer than 4 phenotypes) and a column header row, and ends with
# a row of two empty cells. A parent without childs is paired with '-'; a
# parent with childs is written once, on the row of its first child.
#
# The row appended to summary_stats holds the percentage of phenotypes that
# have at least one child, rounded to 4 decimals.
#
# Returns summary_stats.
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
  total_terms = 0
  terms_with_children = 0
  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
    suggested_childs.each do |pat_id, suggestions|
      warning = suggestions.length < 4 ? 'WARNING: Very few phenotypes' : nil
      csv << ["PATIENT #{pat_id}", "#{warning}"]
      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
      suggestions.each do |(parent_code, parent_name), childs|
        total_terms += 1
        parent_label = "#{parent_name} (#{parent_code})"
        if childs.empty?
          csv << [parent_label, '-']
          next
        end
        terms_with_children += 1
        childs.each_with_index do |(child_code, child_name), position|
          # The parent is only shown next to its first child.
          parent_field = position.zero? ? parent_label : ""
          csv << [parent_field, "#{child_name} (#{child_code})"]
        end
      end
      csv << ["", ""]
    end
  end
  percentage = (terms_with_children.fdiv(total_terms) * 100).round(4)
  summary_stats << ['Percentage of defined HPOs that have more specific childs', percentage]
end
|
289
|
+
|
290
|
+
##########################
|
291
|
+
#OPT-PARSER
|
292
|
+
##########################
|
293
|
+
|
294
|
+
options = {}
|
295
|
+
OptionParser.new do |opts|
|
296
|
+
opts.banner = "Usage: #{__FILE__} [options]"
|
297
|
+
|
298
|
+
options[:coverage_analysis] = true
|
299
|
+
opts.on("-a", "--coverage_analysis", "Deactivate genome coverage analysis. Default true") do
|
300
|
+
options[:coverage_analysis] = false
|
301
|
+
end
|
302
|
+
|
303
|
+
options[:bin_size] = 50000
|
304
|
+
opts.on("-b", "--bin_size INTEGER", "Maximum number of bins to plot the coverage") do |data|
|
305
|
+
options[:bin_size] = data.to_i
|
306
|
+
end
|
307
|
+
|
308
|
+
options[:clusters2show_detailed_phen_data] = 3
|
309
|
+
opts.on("-C", "--clusters2show INTEGER", "How many patient clusters are show in detailed phenotype cluster data section. Default 3") do |data|
|
310
|
+
options[:clusters2show_detailed_phen_data] = data.to_i
|
311
|
+
end
|
312
|
+
|
313
|
+
options[:chromosome_col] = nil
|
314
|
+
opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
|
315
|
+
options[:chromosome_col] = data
|
316
|
+
end
|
317
|
+
|
318
|
+
options[:pat_id_col] = nil
|
319
|
+
opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
|
320
|
+
options[:pat_id_col] = data
|
321
|
+
end
|
322
|
+
|
323
|
+
options[:excluded_hpo] = nil
|
324
|
+
opts.on("-E", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
|
325
|
+
options[:excluded_hpo] = excluded_hpo
|
326
|
+
end
|
327
|
+
|
328
|
+
options[:end_col] = nil
|
329
|
+
opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
|
330
|
+
options[:end_col] = data
|
331
|
+
end
|
332
|
+
|
333
|
+
options[:patients_filter] = 2
|
334
|
+
opts.on("-f", "--patients_filter INTEGER", "Minimum number of patients sharing SORs. Default 0") do |data|
|
335
|
+
options[:patients_filter] = data.to_i
|
336
|
+
end
|
337
|
+
|
338
|
+
options[:clusters2graph] = 30
|
339
|
+
opts.on("-g", "--clusters2graph INTEGER", "How may patient clusters are plotted in cluster plots. Default 30") do |data|
|
340
|
+
options[:clusters2graph] = data.to_i
|
341
|
+
end
|
342
|
+
|
343
|
+
options[:header] = true
|
344
|
+
#chr\tstart\tstop
|
345
|
+
opts.on("-H", "--header", "Set if the file has a line header. Default true") do
|
346
|
+
options[:header] = false
|
347
|
+
end
|
348
|
+
|
349
|
+
options[:input_file] = nil
|
350
|
+
opts.on("-i", "--input_file PATH", "Input file with patient data") do |data|
|
351
|
+
options[:input_file] = data
|
352
|
+
end
|
353
|
+
|
354
|
+
options[:hpo_names] = false
|
355
|
+
opts.on("-n", "--hpo_names", "Define if the input HPO are human readable names. Default false") do
|
356
|
+
options[:hpo_names] = true
|
357
|
+
end
|
358
|
+
|
359
|
+
options[:output_file] = nil
|
360
|
+
opts.on("-o", "--output_file PATH", "Output file with patient data") do |data|
|
361
|
+
options[:output_file] = data
|
362
|
+
end
|
363
|
+
|
364
|
+
options[:hpo_file] = nil
|
365
|
+
opts.on("-P", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value|
|
366
|
+
options[:hpo_file] = value
|
367
|
+
end
|
368
|
+
|
369
|
+
options[:hpo_col] = nil
|
370
|
+
opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
|
371
|
+
options[:hpo_col] = data
|
372
|
+
end
|
373
|
+
|
374
|
+
options[:hpo_separator] = '|'
|
375
|
+
opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data|
|
376
|
+
options[:hpo_separator] = data
|
377
|
+
end
|
378
|
+
|
379
|
+
options[:start_col] = nil
|
380
|
+
opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
|
381
|
+
options[:start_col] = data
|
382
|
+
end
|
383
|
+
|
384
|
+
options[:ic_stats] = false
|
385
|
+
opts.on("-t", "--ic_stats", "Use internal IC stats. Default false") do
|
386
|
+
options[:ic_stats] = true
|
387
|
+
end
|
388
|
+
|
389
|
+
opts.on_tail("-h", "--help", "Show this message") do
|
390
|
+
puts opts
|
391
|
+
exit
|
392
|
+
end
|
393
|
+
|
394
|
+
end.parse!
|
395
|
+
|
396
|
+
|
397
|
+
##########################
|
398
|
+
#MAIN
|
399
|
+
##########################
|
400
|
+
# Every intermediate file lives in a 'temp' folder next to the requested output file.
output_folder = File.dirname(options[:output_file])
detailed_profile_evaluation_file = File.join(output_folder, 'detailed_hpo_profile_evaluation.csv')
temp_folder = File.join(output_folder, 'temp')
matrix_file = File.join(temp_folder, 'pat_hpo_matrix.txt')
clustered_patients_file = File.join(temp_folder, 'cluster_asignation')
cluster_ic_data_file = File.join(temp_folder, 'cluster_ic_data.txt')
cluster_chromosome_data_file = File.join(temp_folder, 'cluster_chromosome_data.txt')
coverage_to_plot_file = File.join(temp_folder, 'coverage_data.txt')
sor_coverage_to_plot_file = File.join(temp_folder, 'sor_coverage_data.txt')
# cnvs_lenght_to_plot_file = File.join(temp_folder, 'cnvs_lenght.txt')
# File.exist? is used because File.exists? no longer exists in Ruby 3.2 and later.
Dir.mkdir(temp_folder) unless File.exist?(temp_folder)
|
411
|
+
|
412
|
+
# LOAD HPO DATA
|
413
|
+
#-------------------------
|
414
|
+
|
415
|
+
# #load hpo dictionaries
|
416
|
+
hpo_black_list = []
|
417
|
+
hpo_black_list = load_hpo_black_list(options[:excluded_hpo]) if !options[:excluded_hpo].nil?
|
418
|
+
hpo_file = ENV['hpo_file']
|
419
|
+
hpo_file = HPO_FILE if hpo_file.nil?
|
420
|
+
hpo_storage = load_hpo_file(hpo_file, hpo_black_list)
|
421
|
+
hpo_parent_child_relations = get_child_parent_relations(hpo_storage)
|
422
|
+
name2code_dictionary = create_hpo_dictionary(hpo_storage) if options[:hpo_names]
|
423
|
+
|
424
|
+
patient_data, $patient_number = load_patient_cohort(options)
|
425
|
+
cohort_hpos, suggested_childs, rejected_hpos = format_patient_data(patient_data, options, name2code_dictionary, hpo_storage, hpo_parent_child_relations)
|
426
|
+
pat_hpo_matrix = generate_patient_hpo_matrix(patient_data, cohort_hpos)
|
427
|
+
write_matrix_for_R(pat_hpo_matrix, cohort_hpos, patient_data.keys, matrix_file)
|
428
|
+
|
429
|
+
# Cluster the patients with the external R script, unless a previous run
# already left its result in the temp folder. The arguments are passed as a
# list (no shell involved) so paths containing spaces reach the script intact.
# File.exist? is used because File.exists? no longer exists in Ruby 3.2 and later.
system('get_clusters.R', matrix_file, temp_folder) unless File.exist?(clustered_patients_file)
clustered_patients = load_clustered_patients(clustered_patients_file)
|
431
|
+
all_ics, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients = process_clustered_patients(options, clustered_patients, patient_data)
|
432
|
+
write_cluster_ic_data(all_ics, cluster_ic_data_file, options[:clusters2graph])
|
433
|
+
system("plot_boxplot.R #{cluster_ic_data_file} #{temp_folder} cluster_id ic 'Cluster size/id' 'Information coefficient'")
|
434
|
+
all_hpo_profiles = get_hpo_profile(patient_data)
|
435
|
+
translate_hpo_codes2names(all_hpo_profiles, hpo_storage)
|
436
|
+
summary_stats = get_summary_stats(patient_data, cohort_hpos, all_hpo_profiles)
|
437
|
+
write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
|
438
|
+
hpo_stats = hpo_stats(all_hpo_profiles)
|
439
|
+
summary_stats << ['Number of unknown phenotypes', rejected_hpos.length]
|
440
|
+
|
441
|
+
all_cnvs_length = []
|
442
|
+
if !options[:chromosome_col].nil?
|
443
|
+
summary_stats << ['Number of clusters with mutations accross > 1 chromosomes', multi_chromosome_patients]
|
444
|
+
write_cluster_chromosome_data(cluster_data_by_chromosomes, cluster_chromosome_data_file, options[:clusters2graph])
|
445
|
+
system("plot_scatterplot.R #{cluster_chromosome_data_file} #{temp_folder} cluster_id chr count 'Cluster size/id' 'Chromosome' 'Patients'")
|
446
|
+
|
447
|
+
#----------------------------------
|
448
|
+
#Prepare data to plot coverage
|
449
|
+
if options[:coverage_analysis]
|
450
|
+
processed_patient_data = process_patient_data(patient_data)
|
451
|
+
cnv_sizes = []
|
452
|
+
processed_patient_data.each do |chr, metadata|
|
453
|
+
metadata.each do |patientID, start, stop|
|
454
|
+
cnv_sizes << stop - start
|
455
|
+
end
|
456
|
+
end
|
457
|
+
cnv_size_average = cnv_sizes.inject{ |sum, el| sum + el }.fdiv(cnv_sizes.length.to_f)
|
458
|
+
patients_by_cluster, sors = generate_cluster_regions(processed_patient_data, 'A', 0)
|
459
|
+
total_patients_sharing_sors = []
|
460
|
+
all_patients = patients_by_cluster.keys
|
461
|
+
all_patients.each do |identifier|
|
462
|
+
total_patients_sharing_sors << identifier.split('_i').first
|
463
|
+
end
|
464
|
+
all_cnvs_length = get_cnvs_length(patient_data)
|
465
|
+
|
466
|
+
###1. Process CNVs
|
467
|
+
raw_coverage, n_cnv, nt, pats_per_region = calculate_coverage(sors)
|
468
|
+
summary_stats << ['Number of genome windows', n_cnv]
|
469
|
+
summary_stats << ['Nucleotides affected by mutations', nt]
|
470
|
+
summary_stats << ['Patient average per region', pats_per_region.round(4)]
|
471
|
+
summary_stats << ['CNV size average', cnv_size_average.round(4)]
|
472
|
+
coverage_to_plot = get_final_coverage(raw_coverage, options[:bin_size])
|
473
|
+
write_coverage_data(coverage_to_plot, coverage_to_plot_file)
|
474
|
+
cmd = "plot_area.R -d #{coverage_to_plot_file} -o #{temp_folder}/coverage_plot -x V2 -y V3 -f V1 -H -m #{CHR_SIZE} -t CNV"
|
475
|
+
system(cmd)
|
476
|
+
|
477
|
+
###2. Process SORs
|
478
|
+
raw_sor_coverage, n_sor, nt, pats_per_region = calculate_coverage(sors, options[:patients_filter] - 1)
|
479
|
+
summary_stats << ["Number of patients with at least 1 SOR", total_patients_sharing_sors.uniq.length]
|
480
|
+
summary_stats << ["Number of SORs with >= #{options[:patients_filter]} patients", n_sor]
|
481
|
+
summary_stats << ['Nucleotides affected by mutations', nt]
|
482
|
+
# summary_stats << ['Patient average per region', pats_per_region]
|
483
|
+
sor_coverage_to_plot = get_final_coverage(raw_sor_coverage, options[:bin_size])
|
484
|
+
write_coverage_data(sor_coverage_to_plot, sor_coverage_to_plot_file)
|
485
|
+
system("plot_area.R -d #{sor_coverage_to_plot_file} -o #{temp_folder}/sor_coverage_plot -x V2 -y V3 -f V1 -H -m #{CHR_SIZE} -t SOR")
|
486
|
+
all_sor_length = get_sor_length_distribution(raw_sor_coverage)
|
487
|
+
end
|
488
|
+
end
|
489
|
+
#----------------------------------
|
490
|
+
#Report
|
491
|
+
total_patients = 0
|
492
|
+
new_cluster_phenotypes = {}
|
493
|
+
phenotypes_frequency = Hash.new(0)
|
494
|
+
top_cluster_phenotypes.each_with_index do |cluster, clusterID|
|
495
|
+
total_patients = cluster.length
|
496
|
+
cluster.each do |phenotypes|
|
497
|
+
phenotypes.each do |p|
|
498
|
+
phenotypes_frequency[p] += 1
|
499
|
+
end
|
500
|
+
end
|
501
|
+
new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
|
502
|
+
phenotypes_frequency = Hash.new(0)
|
503
|
+
end
|
504
|
+
|
505
|
+
container = {
|
506
|
+
:temp_folder => temp_folder,
|
507
|
+
# :top_cluster_phenotypes => top_cluster_phenotypes.length,
|
508
|
+
:summary_stats => summary_stats,
|
509
|
+
:hpo_stats => hpo_stats,
|
510
|
+
:all_cnvs_length => all_cnvs_length,
|
511
|
+
:all_sor_length => all_sor_length,
|
512
|
+
:new_cluster_phenotypes => new_cluster_phenotypes.keys.length
|
513
|
+
}
|
514
|
+
# top_cluster_phenotypes.each_with_index do |cluster, i|
|
515
|
+
# clust_pr = cluster.map{|pr| [pr.join(', ')] }
|
516
|
+
# container["clust_#{i}"] = clust_pr
|
517
|
+
# end
|
518
|
+
|
519
|
+
clust_info = []
|
520
|
+
new_cluster_phenotypes.each do |clusterID, info|
|
521
|
+
phens = info[1].join(', ')
|
522
|
+
freqs = info[2].map{|a| a.round(4)}.join(', ')
|
523
|
+
clust_info << [info[0], phens, freqs]
|
524
|
+
container["clust_#{clusterID}"] = clust_info
|
525
|
+
clust_info = []
|
526
|
+
end
|
527
|
+
|
528
|
+
template = File.open(File.join(REPORT_FOLDER, 'cohort_report.erb')).read
|
529
|
+
report = Report_html.new(container, 'Cohort quality report')
|
530
|
+
report.build(template)
|
531
|
+
report.write(options[:output_file]+'.html')
|