pets 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +6 -0
  9. data/bin/area_under_curve_pr.rb +118 -0
  10. data/bin/association_metrics_average.rb +94 -0
  11. data/bin/coPatReporter.rb +531 -0
  12. data/bin/console +14 -0
  13. data/bin/fmeasure_index.rb +72 -0
  14. data/bin/get_PR_values.rb +90 -0
  15. data/bin/get_clusters.R +18 -0
  16. data/bin/get_network_nodes.rb +197 -0
  17. data/bin/lines.R +77 -0
  18. data/bin/merge_by_cluster.rb +62 -0
  19. data/bin/merge_pairs.rb +138 -0
  20. data/bin/paco_translator.rb +102 -0
  21. data/bin/phen2reg.rb +385 -0
  22. data/bin/phen2reg_predictor_check.rb +297 -0
  23. data/bin/plot_area.R +71 -0
  24. data/bin/plot_boxplot.R +21 -0
  25. data/bin/plot_density.R +46 -0
  26. data/bin/plot_scatterplot.R +25 -0
  27. data/bin/reg2phen.rb +116 -0
  28. data/bin/region_to_patients_generator.rb +84 -0
  29. data/bin/relate_CI_to_association_value.rb +90 -0
  30. data/bin/setup +8 -0
  31. data/bin/standardize_scores.R +40 -0
  32. data/bin/xyplot_graph.R +60 -0
  33. data/external_data/biosystems_gene.gz +0 -0
  34. data/external_data/bsid2info.gz +0 -0
  35. data/external_data/chromosome_sizes_hg19.txt +24 -0
  36. data/external_data/gene_data.gz +0 -0
  37. data/external_data/gene_data_with_pathways.gz +0 -0
  38. data/external_data/gene_location.gz +0 -0
  39. data/external_data/hp.obo +146363 -0
  40. data/external_data/remove +0 -0
  41. data/lib/pets.rb +6 -0
  42. data/lib/pets/coPatReporterMethods.rb +77 -0
  43. data/lib/pets/generalMethods.rb +556 -0
  44. data/lib/pets/phen2reg_methods.rb +432 -0
  45. data/lib/pets/version.rb +3 -0
  46. data/pets.gemspec +47 -0
  47. data/templates/cohort_report.erb +93 -0
  48. data/templates/patient_report.erb +209 -0
  49. metadata +183 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a629423c3668446b726dc91d984efe15e730ed48
4
+ data.tar.gz: 11b466ef84cdf9d84354fb54f7c5371fee6fe067
5
+ SHA512:
6
+ metadata.gz: c361cd825328b9265851eef94ae704e2bfc4c88c215b193ee10edb04e2caa93d538c66c7701d09b2ae12c018921f8d579a133705a5ab16e342e5cf961abbd370
7
+ data.tar.gz: 8acfa703181a2787f8e50c36b0dd2eb8704b7f3f10f6b45e2a7777e3a7c144182c94ffad57a919efb51c6c22817fff436c1b549f6eee5e7fc5ea07853f6a5512
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.4.1
7
+ before_install: gem install bundler -v 2.0.1
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in pets.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 elenarojano
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # Gephepred
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/gephepred`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'gephepred'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install gephepred
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/gephepred.
36
+
37
+
38
+ ## License
39
+
40
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
41
+
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,118 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ #Tool for calculating the AUC of PR curves.
4
+
5
+ ##########################
6
+ #LIBRARIES
7
+ ##########################
8
+
9
+ require 'optparse'
10
+
11
+ ##########################
12
+ #METHODS
13
+ ##########################
14
+
15
# Load the x/y coordinates of a precision-recall curve from a tab-separated file.
#
# Any line containing the substring 'rec' (which also covers 'prec') is treated
# as a header line and skipped.
#
# input_file - path of the tab-separated file to read.
# x_val_col  - 1-based column number holding the x values.
# y_val_col  - 1-based column number holding the y values.
#
# Returns an Array of [x, y] Float pairs sorted by ascending x.
def load_file(input_file, x_val_col, y_val_col)
  coordinates = []
  # File.foreach closes the file once the last line has been read; the previous
  # File.open(...).each form left the handle open.
  File.foreach(input_file) do |line|
    line.chomp!
    next if line.include?('prec') || line.include?('rec')
    fields = line.split("\t")
    coordinates << [fields[x_val_col - 1].to_f, fields[y_val_col - 1].to_f]
  end
  coordinates.sort { |a, b| a[0] <=> b[0] }
end
28
+
29
+
30
# Compute the area under a curve given as a list of points.
#
# The area is the rectangle from x = 0 to the first point (first_x * first_y)
# plus one trapezoid per pair of consecutive points.
#
# The previous implementation added a rectangle of height `current_y` plus a
# triangle; that is only a trapezoid when y does not increase, so every rising
# segment was overestimated. It also divided by the Integer 2, truncating the
# triangle term for Integer input.
#
# pr_values - Array of [x, y] pairs, expected to be sorted by x.
#
# Returns the total area (0 for an empty list).
def calculate_auc(pr_values)
  return 0 if pr_values.empty?

  first_x, first_y = pr_values.first
  total_area = first_x * first_y
  pr_values.each_cons(2) do |(prev_x, prev_y), (cur_x, cur_y)|
    # Trapezoid rule: width * mean height.
    total_area += (cur_x - prev_x).abs * (prev_y + cur_y) / 2.0
  end
  total_area
end
56
+
57
+ # def calculate_auc(pr_values)
58
+ # #pr_values = [[x, y], [x', y']...]
59
+ # counter = 0
60
+ # x_val = 0
61
+ # y_val = 0
62
+ # total_square_area = 0
63
+ # total_triangle_area = 0
64
+ # pr_values.each do |xy_pair|
65
+ # if counter != 0
66
+ # x_prime = xy_pair[0]
67
+ # y_prime = xy_pair[1]
68
+ # #puts "#{x_prime}\t#{x_val}"
69
+ # #puts "#{y_prime}\t#{y_val}"
70
+ # total_square_area += (x_val - x_prime) * y_prime
71
+ # total_triangle_area += (x_val - x_prime) * (y_prime - y_val) / 2
72
+ # x_val = x_prime
73
+ # y_val = y_prime
74
+ # else
75
+ # x_val = xy_pair[0]
76
+ # y_val = xy_pair[1]
77
+ # counter += 1
78
+ # end
79
+ # end
80
+ # total_area = total_square_area + total_triangle_area
81
+ # STDERR.puts total_area
82
+ # return total_area
83
+ # end
84
+
85
+
86
+ ##########################
87
+ #OPT-PARSE
88
+ ##########################
89
+
90
# Command line options of the AUC script. All three values default to nil.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:input_file] = nil
  opts.on("-f", "--input_file PATH", "Precision-recall values file") do |input_file|
    options[:input_file] = input_file
  end

  # The long switches were originally declared as "--:x_values" / "--:y_values"
  # (stray colon). The conventional spelling is added and the old one is kept
  # as an alias so existing command lines keep working.
  options[:x_values] = nil
  opts.on("-x", "--x_values INTEGER", "--:x_values INTEGER", "Set column for extracting x values") do |x_values|
    options[:x_values] = x_values.to_i
  end

  options[:y_values] = nil
  opts.on("-y", "--y_values INTEGER", "--:y_values INTEGER", "Set column for extracting y values") do |y_values|
    options[:y_values] = y_values.to_i
  end

end.parse!
110
+
111
+ ##########################
112
+ #MAIN
113
+ ##########################
114
+
115
# All three options are mandatory. Stop with a readable message instead of
# failing with a TypeError/NoMethodError inside load_file.
missing_options = [:input_file, :x_values, :y_values].select { |key| options[key].nil? }
abort("ERROR: missing required option(s): #{missing_options.join(', ')}") unless missing_options.empty?

pr_values = load_file(options[:input_file], options[:x_values], options[:y_values])
final_area = calculate_auc(pr_values)
puts final_area
@@ -0,0 +1,94 @@
1
+ #! /usr/bin/env ruby
2
+ #Tool for calculating averages between different association values file.
3
+ #File structure: prec rec cut meth
4
+ #Load all files (7) stored in the same directory and calculate the average
5
+ #of the lines for each method. Return a file with the same structure,
6
+ #naming the last column "average".
7
+
8
+ require 'optparse'
9
+
10
+ ##########################
11
+ #METHODS
12
+ ##########################
13
+
14
# Read a tab-separated association file.
#
# The first line is kept verbatim as the header. Every following line is split
# on tabs; its first three fields are converted with to_f and the fourth is kept
# as a String.
#
# filename - path of the file to read.
#
# Returns [rows, header]: rows is an Array of [Float, Float, Float, String]
# and header is the first line ('' for an empty file).
def load_association_file(filename)
  rows = []
  header = ''
  # File.foreach closes the file when iteration ends; the previous
  # File.open(...).each form left the handle open.
  File.foreach(filename).with_index do |line, line_number|
    line.chomp!
    if line_number.zero?
      header = line
    else
      cut, precision, recall, meth = line.split("\t")
      rows << [cut.to_f, precision.to_f, recall.to_f, meth]
    end
  end
  return rows, header
end
30
+
31
# Average the selected columns line by line across several loaded files.
#
# The first file is the reference: the result has one row per row of the first
# file, and the columns not listed in cols_for_average are copied from it.
# Unlike the previous implementation, neither all_files nor any of its rows is
# modified, and an empty all_files returns [] instead of raising.
#
# all_files        - Array of files, each an Array of rows (Arrays).
# cols_for_average - Array of 0-based column indexes to average.
#
# Returns a new Array of rows.
# Raises ArgumentError when a file has fewer rows than the first one.
def calculate_average(all_files, cols_for_average)
  return [] if all_files.empty?

  n_files = all_files.length.to_f
  ref_file, *other_files = all_files
  ref_file.each_with_index.map do |ref_line, i|
    averaged = ref_line.dup
    cols_for_average.each do |col|
      total = other_files.inject(ref_line[col]) do |sum, file|
        row = file[i]
        raise ArgumentError, "input file has no line #{i + 1} to average with the reference file" if row.nil?
        sum + row[col]
      end
      averaged[col] = total / n_files
    end
    averaged
  end
end
53
+
54
+
55
+ ##########################
56
+ #OPT-PARSER
57
+ ##########################
58
+
59
# Command line options; both default to nil until given on the command line.
options = { file_names: nil, which_cols: nil }
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  # Comma-separated list of input files.
  opts.on("-f", "--file_names STRING", "Input file names to calculate averages. Please separate names by commas") { |names| options[:file_names] = names.split(',') }

  # Comma-separated 1-based column numbers, stored as 0-based indexes.
  opts.on("-c", "--which_cols STRING", "Cols for performing average analysis") { |cols| options[:which_cols] = cols.split(',').map { |col| col.to_i - 1 } }
end
option_parser.parse!
74
+
75
+ ##########################
76
+ #MAIN
77
+ ##########################
78
+
79
# Both options are mandatory. Stop with a readable message instead of a
# NoMethodError on nil further down.
abort('ERROR: no input files given (use -f file1,file2,...)') if options[:file_names].nil? || options[:file_names].empty?
abort('ERROR: no columns given (use -c col1,col2,...)') if options[:which_cols].nil?

all_files = []
header = nil
options[:file_names].each do |filename|
  # header ends up holding the header of the last file read.
  file, header = load_association_file(filename)
  all_files << file
end

average = calculate_average(all_files, options[:which_cols])

puts header
average.each do |line|
  puts line.join("\t")
end
92
+
93
+
94
+
@@ -0,0 +1,531 @@
1
+ #! /usr/bin/env ruby
2
+
3
# Locations resolved relative to this script (bin/): report templates, bundled
# external data files and the gem's library folder.
ROOT_PATH = File.dirname(__FILE__)
REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
CHR_SIZE = File.join(EXTERNAL_DATA, 'chromosome_sizes_hg19.txt')
# The gem ships its library files (generalMethods.rb, coPatReporterMethods.rb)
# under lib/pets. The former project folder lib/gephepred is kept on the load
# path as well so older checkouts keep working.
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'gephepred'))
10
+
11
+ require 'optparse'
12
+ require 'csv'
13
+ require 'generalMethods.rb'
14
+ require 'coPatReporterMethods.rb'
15
+ require 'report_html'
16
+
17
+ ##########################
18
+ #METHODS
19
+ ##########################
20
+ HPOS = 0
21
+ CHR = 1
22
+ START = 2
23
+ STOP = 3
24
+
25
# Normalise every patient record in place and collect cohort-wide HPO data.
#
# Each record is destructured as [hpo string, chromosome, start, stop]. The HPO
# string is split with options[:hpo_separator], optionally translated from names
# to codes (options[:hpo_names]) and validated with check_hpo_codes. The record
# is then updated: slot HPOS receives the HPO list, and slots START/STOP are
# converted to Integer when present.
#
# Returns [unique cohort HPOs, { patient id => check_hpo_codes result },
# unique rejected HPOs].
def format_patient_data(patient_data, options, name2code_dictionary, hpo_storage, hpo_parent_child_relations)
  cohort_hpos = []
  rejected = []
  suggestions_by_patient = {}
  patient_data.each do |pat_id, record|
    raw_profile, _chr, start, stop = record
    hpos = raw_profile.split(options[:hpo_separator])
    if options[:hpo_names]
      translate_hpo_names2codes(hpos, name2code_dictionary, pat_id, rejected)
    end
    suggestions_by_patient[pat_id] = check_hpo_codes(hpos, hpo_storage, hpo_parent_child_relations, pat_id, rejected)
    cohort_hpos.concat(hpos)
    record[HPOS] = hpos
    record[START] = start.to_i unless start.nil?
    record[STOP] = stop.to_i unless stop.nil?
  end
  [cohort_hpos.uniq, suggestions_by_patient, rejected.uniq]
end
41
+
42
# Replace, in place, the HPO names in `hpos` with their codes.
#
# Names missing from hpo_dictionary are dropped from `hpos`, appended to
# rejected_hpos and reported on STDERR.
#
# Returns the (mutated) hpos array.
def translate_hpo_names2codes(hpos, hpo_dictionary, pat_id, rejected_hpos)
  translated = hpos.each_with_object([]) do |name, codes|
    code = hpo_dictionary[name]
    if code.nil?
      STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAME '#{name}'. Rejected."
      rejected_hpos << name
    else
      codes << code
    end
  end
  hpos.replace(translated)
end
56
+
57
# Validate the HPO codes of one patient against hpo_storage.
#
# `hpos` is modified in place: every known code is replaced by the main code
# stored in hpo_storage (first element of the stored record), and unknown codes
# are removed, appended to rejected_hpos and reported on STDERR.
#
# Returns an Array with one [[main_code, name], children] entry per accepted
# code, where children is the value found in hpo_parent_child_relations for the
# main code, or [] when there is none.
def check_hpo_codes(hpos, hpo_storage, hpo_parent_child_relations, pat_id, rejected_hpos)
  suggestions = []
  hpos.map! do |code|
    record = hpo_storage[code]
    if record.nil?
      STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODE '#{code}'. Rejected."
      rejected_hpos << code
      next nil # removed by compact! below
    end
    main_code, name = record
    children = hpo_parent_child_relations[main_code]
    children = [] if children.nil?
    suggestions << [[main_code, name], children]
    main_code # alternate codes are replaced by the main one
  end
  hpos.compact!
  suggestions
end
80
+
81
# Build a binary patient x HPO matrix.
#
# patient_data - Hash of patient id => record; the record's HPO list is read
#                from slot HPOS.
# cohort_hpos  - Array of HPOs defining the matrix columns.
#
# Returns one row per patient (in patient_data order); a cell is 1 when the
# patient has the HPO of that column and 0 otherwise.
# Raises KeyError when a patient has an HPO that is not in cohort_hpos
# (previously an opaque TypeError from indexing with nil).
def generate_patient_hpo_matrix(patient_data, cohort_hpos)
  # One hash lookup per HPO instead of a linear Array#index scan.
  # The first position wins for repeated HPOs, as Array#index did.
  column_of = {}
  cohort_hpos.each_with_index do |hpo, position|
    column_of[hpo] = position unless column_of.key?(hpo)
  end
  n = cohort_hpos.length
  patient_data.map do |_pat_id, patient_record|
    vector = Array.new(n, 0)
    patient_record[HPOS].each { |hpo| vector[column_of.fetch(hpo)] = 1 }
    vector
  end
end
94
+
95
# Write a matrix as a tab-separated table.
#
# The first line holds x_names joined by tabs (no leading cell for the row
# names); every following line holds the row name followed by the row values.
def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |out|
    out.puts x_names.join("\t")
    matrix.each_with_index do |row, position|
      cells = [y_names[position], *row]
      out.puts cells.join("\t")
    end
  end
end
103
+
104
# Collect per-cluster statistics: profile ICs, chromosome distribution and the
# phenotypes of the largest clusters.
#
# Clusters are visited from largest to smallest; single-patient clusters are
# skipped. The IC table comes from load_hpo_ci_values (file given by the
# 'ic_file' environment variable, or IC_FILE) when options[:ic_stats] is set,
# otherwise from compute_IC_values.
#
# Returns four values:
#   all_ics                     - one Array of patient profile ICs per cluster.
#   cluster_data_by_chromosomes - [cluster_id, cluster size, chr, count] rows
#                                 (only filled when options[:chromosome_col] is set).
#   top_cluster_phenotypes      - phenotype lists of the patients in the first
#                                 options[:clusters2show_detailed_phen_data] clusters.
#   multi_chromosome_patients   - sum of the sizes of the clusters whose
#                                 patients span more than one chromosome.
def process_clustered_patients(options, clustered_patients, patient_data) # get ic and chromosomes
  phenotype_ic =
    if options[:ic_stats]
      load_hpo_ci_values(ENV['ic_file'] || IC_FILE)
    else
      compute_IC_values(patient_data, $patient_number)
    end
  all_ics = []
  top_cluster_phenotypes = []
  cluster_data_by_chromosomes = []
  multi_chromosome_patients = 0
  processed_clusters = 0
  clusters_by_size = clustered_patients.sort_by { |_cl_id, pat_ids| pat_ids.length }.reverse
  clusters_by_size.each do |cluster_id, patient_ids|
    num_of_patients = patient_ids.length
    next if num_of_patients == 1

    keep_details = processed_clusters < options[:clusters2show_detailed_phen_data]
    track_chromosomes = !options[:chromosome_col].nil?
    chrs = Hash.new(0)
    all_phens = []
    profile_ics = patient_ids.map do |pat_id|
      patient = patient_data[pat_id]
      phenotypes = patient[HPOS]
      profile_ic = get_profile_ic(phenotypes, phenotype_ic)
      all_phens << phenotypes if keep_details
      chrs[patient[CHR]] += 1 if track_chromosomes
      profile_ic
    end
    top_cluster_phenotypes << all_phens if keep_details
    all_ics << profile_ics
    if track_chromosomes
      multi_chromosome_patients += num_of_patients if chrs.length > 1
      chrs.each do |chr, count|
        cluster_data_by_chromosomes << [cluster_id, num_of_patients, chr, count]
      end
    end
    processed_clusters += 1
  end
  return all_ics, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
end
145
+
146
# Mean information content of a phenotype profile.
#
# The ICs of the terms found in phenotype_ic are added up and divided by the
# total number of terms in the profile (terms without an IC still count in the
# denominator). An empty profile yields 0.0.
def get_profile_ic(hpo_names, phenotype_ic)
  total_ic = hpo_names.inject(0) do |acc, hpo_id|
    term_ic = phenotype_ic[hpo_id]
    term_ic.nil? ? acc : acc + term_ic
  end
  term_count = hpo_names.length
  term_count = 1 if term_count.zero? # avoid dividing by zero
  total_ic.fdiv(term_count)
end
158
+
159
# Write the profile ICs of the first `limit` clusters as a two-column table.
#
# Each IC gets one line labelled "<cluster size>_<cluster position>".
def write_cluster_ic_data(all_ics, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |out|
    out.puts %w[cluster_id ic].join("\t")
    all_ics.each_with_index do |cluster_ics, position|
      break if position == limit
      label = "#{cluster_ics.length}_#{position}"
      cluster_ics.each { |ic| out.puts "#{label}\t#{ic}" }
    end
  end
end
171
+
172
# Write the chromosome counts of the first `limit` clusters.
#
# Rows are [cluster_id, patient_number, chr, count]; consecutive rows sharing
# the same cluster_id belong to the same cluster. Each output line is labelled
# "<patient_number>_<cluster position>".
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |out|
    out.puts %w[cluster_id chr count].join("\t")
    same_cluster_runs = cluster_data.chunk_while { |previous, current| previous.first == current.first }
    same_cluster_runs.each_with_index do |rows, index|
      break if index == limit
      rows.each do |_cluster_id, patient_number, chr, count|
        out.puts ["#{patient_number}_#{index}", chr, count].join("\t")
      end
    end
  end
end
185
+
186
# Write coverage triples as tab-separated "chr position freq" lines (no header).
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |out|
    coverage_to_plot.each do |chr, position, freq|
      out.puts(format("%s\t%s\t%s", chr, position, freq))
    end
  end
end
193
+
194
# Collect the HPO list (slot HPOS) of every patient record, in patient_data order.
def get_hpo_profile(patient_data)
  patient_data.map { |_pat_id, record| record[HPOS] }
end
201
+
202
# Build the general cohort statistics shown in the report.
#
# patient_data     - Hash keyed by patient id; only the keys are used. The text
#                    before the first '_i' of each key is taken as the patient
#                    identifier (presumably the rest is a per-record suffix;
#                    confirm against load_patient_cohort).
# cohort_hpos      - Array of the unique HPOs of the cohort.
# all_hpo_profiles - Array with one HPO list per patient record.
#
# Returns an Array of [label, value] pairs.
def get_summary_stats(patient_data, cohort_hpos, all_hpo_profiles)
  stats = []
  stats << ['Unique HPOs', cohort_hpos.length]
  n_pat = patient_data.keys.map { |pat_id| pat_id.split('_i').first }.uniq.length
  stats << ['Number of patients in the cohort', n_pat]
  all_hpo_prof_lengths = all_hpo_profiles.map { |profile| profile.length }.sort
  stats << ['HPOs per patient (average)', all_hpo_prof_lengths.inject(0) { |sum, n| sum + n }.fdiv(n_pat).round(4)]
  # Walk the ascending profile lengths until more than 10% of the patients have
  # been passed. The index is clamped to the last element: the loop looks one
  # position ahead (count + 1), which used to run past the end of the array and
  # yield nil for small cohorts.
  hpo_pat90 = nil
  last_index = all_hpo_prof_lengths.length - 1
  rate = 0
  count = 0
  while rate <= 0.1
    hpo_pat90 = all_hpo_prof_lengths[[count + 1, last_index].min] unless all_hpo_prof_lengths.empty?
    rate = count.fdiv(n_pat)
    count += 1
  end
  stats << ['HPOs for patient in percentile 90', hpo_pat90]
  return stats
end
226
+
227
# Percentage of profiles in which each HPO appears.
#
# Returns at most 21 [hpo, percentage] pairs, sorted by descending percentage.
def hpo_stats(all_hpo_profiles)
  occurrences = Hash.new(0)
  all_hpo_profiles.each do |profile|
    profile.each { |hpo| occurrences[hpo] += 1 }
  end
  profile_count = all_hpo_profiles.length
  ranked = occurrences.map { |hpo, times| [hpo, times.fdiv(profile_count) * 100] }
  ranked.sort { |a, b| b[1] <=> a[1] }.first(21)
end
242
+
243
# Replace, in place, every HPO code in the profiles with the name stored as the
# second element of its hpo_storage record. Codes missing from hpo_storage are
# left untouched and reported on STDERR.
def translate_hpo_codes2names(all_hpo_profiles, hpo_storage)
  all_hpo_profiles.each do |profile|
    profile.map! do |hpo|
      record = hpo_storage[hpo]
      if record.nil?
        STDERR.puts "WARNING: hpo code '#{hpo}' not exists."
        hpo
      else
        record[1]
      end
    end
  end
end
255
+
256
# Write, as CSV, the phenotypes of every patient next to their more specific
# child terms, and append to summary_stats the percentage of phenotypes that
# have at least one child.
#
# suggested_childs - Hash of patient id => Array of [[code, name], children],
#                    where children is an Array of [code, name] pairs.
#
# Per patient the file contains a title row (with a warning when the patient
# has fewer than 4 phenotypes), a column header row, one row per child (the
# parent is only printed on the first one; '-' is used when there are no
# children) and a final row of two empty cells.
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
  total_terms = 0
  terms_with_children = 0
  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
    suggested_childs.each do |pat_id, suggestions|
      warning = suggestions.length < 4 ? 'WARNING: Very few phenotypes' : ''
      csv << ["PATIENT #{pat_id}", warning]
      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
      suggestions.each do |(parent_code, parent_name), childs|
        total_terms += 1
        parent_label = "#{parent_name} (#{parent_code})"
        if childs.empty?
          csv << [parent_label, '-']
          next
        end
        terms_with_children += 1
        childs.each_with_index do |(child_code, child_name), position|
          csv << [position.zero? ? parent_label : "", "#{child_name} (#{child_code})"]
        end
      end
      csv << ["", ""]
    end
  end
  summary_stats << ['Percentage of defined HPOs that have more specific childs', (terms_with_children.fdiv(total_terms) * 100).round(4)]
end
289
+
290
+ ##########################
291
+ #OPT-PARSER
292
+ ##########################
293
+
294
# Command line options of the cohort reporter. Every option gets its default
# before the switch is declared, so `options` is fully populated after parsing.
# Two help texts were corrected to match the code: -f (its default is 2, not 0)
# and -H (giving the flag means the input file has NO header line).
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:coverage_analysis] = true
  opts.on("-a", "--coverage_analysis", "Deactivate genome coverage analysis. Default true") do
    options[:coverage_analysis] = false
  end

  options[:bin_size] = 50000
  opts.on("-b", "--bin_size INTEGER", "Maximum number of bins to plot the coverage") do |data|
    options[:bin_size] = data.to_i
  end

  options[:clusters2show_detailed_phen_data] = 3
  opts.on("-C", "--clusters2show INTEGER", "How many patient clusters are show in detailed phenotype cluster data section. Default 3") do |data|
    options[:clusters2show_detailed_phen_data] = data.to_i
  end

  options[:chromosome_col] = nil
  opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
    options[:chromosome_col] = data
  end

  options[:pat_id_col] = nil
  opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
    options[:pat_id_col] = data
  end

  options[:excluded_hpo] = nil
  opts.on("-E", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
    options[:excluded_hpo] = excluded_hpo
  end

  options[:end_col] = nil
  opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
    options[:end_col] = data
  end

  options[:patients_filter] = 2
  opts.on("-f", "--patients_filter INTEGER", "Minimum number of patients sharing SORs. Default 2") do |data|
    options[:patients_filter] = data.to_i
  end

  options[:clusters2graph] = 30
  opts.on("-g", "--clusters2graph INTEGER", "How may patient clusters are plotted in cluster plots. Default 30") do |data|
    options[:clusters2graph] = data.to_i
  end

  options[:header] = true
  #chr\tstart\tstop
  opts.on("-H", "--header", "Set if the file has no header line. By default a header line is expected") do
    options[:header] = false
  end

  options[:input_file] = nil
  opts.on("-i", "--input_file PATH", "Input file with patient data") do |data|
    options[:input_file] = data
  end

  options[:hpo_names] = false
  opts.on("-n", "--hpo_names", "Define if the input HPO are human readable names. Default false") do
    options[:hpo_names] = true
  end

  options[:output_file] = nil
  opts.on("-o", "--output_file PATH", "Output file with patient data") do |data|
    options[:output_file] = data
  end

  options[:hpo_file] = nil
  opts.on("-P", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value|
    options[:hpo_file] = value
  end

  options[:hpo_col] = nil
  opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
    options[:hpo_col] = data
  end

  options[:hpo_separator] = '|'
  opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data|
    options[:hpo_separator] = data
  end

  options[:start_col] = nil
  opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
    options[:start_col] = data
  end

  options[:ic_stats] = false
  opts.on("-t", "--ic_stats", "Use internal IC stats. Default false") do
    options[:ic_stats] = true
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end

end.parse!
395
+
396
+
397
+ ##########################
398
+ #MAIN
399
+ ##########################
400
# Output locations: the CSV evaluation is written next to the report, and the
# intermediate tables and plots go to a 'temp' folder beside it.
output_folder = File.dirname(options[:output_file])
detailed_profile_evaluation_file = File.join(output_folder, 'detailed_hpo_profile_evaluation.csv')
temp_folder = File.join(output_folder, 'temp')
matrix_file = File.join(temp_folder, 'pat_hpo_matrix.txt')
clustered_patients_file = File.join(temp_folder, 'cluster_asignation')
cluster_ic_data_file = File.join(temp_folder, 'cluster_ic_data.txt')
cluster_chromosome_data_file = File.join(temp_folder, 'cluster_chromosome_data.txt')
coverage_to_plot_file = File.join(temp_folder, 'coverage_data.txt')
sor_coverage_to_plot_file = File.join(temp_folder, 'sor_coverage_data.txt')
# cnvs_lenght_to_plot_file = File.join(temp_folder, 'cnvs_lenght.txt')
# File.exists? was deprecated for years and removed in Ruby 3.2.
Dir.mkdir(temp_folder) unless File.exist?(temp_folder)
411
+
412
+ # LOAD HPO DATA
413
+ #-------------------------
414
+
415
+ # #load hpo dictionaries
416
# Optional list of HPO terms to exclude.
hpo_black_list = options[:excluded_hpo].nil? ? [] : load_hpo_black_list(options[:excluded_hpo])
# HPO ontology file, by precedence: the -P/--hpo_file option (previously parsed
# but never used), the 'hpo_file' environment variable, then the bundled hp.obo.
hpo_file = options[:hpo_file] || ENV['hpo_file'] || HPO_FILE
hpo_storage = load_hpo_file(hpo_file, hpo_black_list)
hpo_parent_child_relations = get_child_parent_relations(hpo_storage)
# The name => code dictionary is only needed when the input uses HPO names.
name2code_dictionary = create_hpo_dictionary(hpo_storage) if options[:hpo_names]
423
+
424
# Load and normalise the cohort, then write the patient x HPO matrix.
patient_data, $patient_number = load_patient_cohort(options)
cohort_hpos, suggested_childs, rejected_hpos = format_patient_data(patient_data, options, name2code_dictionary, hpo_storage, hpo_parent_child_relations)
pat_hpo_matrix = generate_patient_hpo_matrix(patient_data, cohort_hpos)
write_matrix_for_R(pat_hpo_matrix, cohort_hpos, patient_data.keys, matrix_file)

# External commands are run with an argument list (no shell), so paths with
# spaces or shell metacharacters are passed through untouched.
# Clustering is skipped when a previous run already left its result in temp.
# File.exists? was removed in Ruby 3.2, hence File.exist?.
system('get_clusters.R', matrix_file, temp_folder) unless File.exist?(clustered_patients_file)
clustered_patients = load_clustered_patients(clustered_patients_file)
all_ics, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients = process_clustered_patients(options, clustered_patients, patient_data)
write_cluster_ic_data(all_ics, cluster_ic_data_file, options[:clusters2graph])
system('plot_boxplot.R', cluster_ic_data_file, temp_folder, 'cluster_id', 'ic', 'Cluster size/id', 'Information coefficient')
all_hpo_profiles = get_hpo_profile(patient_data)
translate_hpo_codes2names(all_hpo_profiles, hpo_storage)
summary_stats = get_summary_stats(patient_data, cohort_hpos, all_hpo_profiles)
write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
hpo_stats = hpo_stats(all_hpo_profiles)
summary_stats << ['Number of unknown phenotypes', rejected_hpos.length]
440
+
441
# Chromosome distribution and genome coverage; both need the chromosome column.
# all_sor_length is only assigned inside the coverage branch and is nil when
# that branch is skipped.
all_cnvs_length = []
if !options[:chromosome_col].nil?
  summary_stats << ['Number of clusters with mutations accross > 1 chromosomes', multi_chromosome_patients]
  write_cluster_chromosome_data(cluster_data_by_chromosomes, cluster_chromosome_data_file, options[:clusters2graph])
  # External plotting scripts are run with an argument list (no shell), so
  # paths containing spaces or shell metacharacters are safe.
  system('plot_scatterplot.R', cluster_chromosome_data_file, temp_folder, 'cluster_id', 'chr', 'count', 'Cluster size/id', 'Chromosome', 'Patients')

  #----------------------------------
  #Prepare data to plot coverage
  if options[:coverage_analysis]
    processed_patient_data = process_patient_data(patient_data)
    cnv_sizes = []
    processed_patient_data.each do |chr, metadata|
      metadata.each do |patientID, start, stop|
        cnv_sizes << stop - start
      end
    end
    # inject returns nil for an empty list, which used to raise NoMethodError.
    cnv_size_average = cnv_sizes.empty? ? 0.0 : cnv_sizes.inject { |sum, el| sum + el }.fdiv(cnv_sizes.length)
    patients_by_cluster, sors = generate_cluster_regions(processed_patient_data, 'A', 0)
    total_patients_sharing_sors = []
    all_patients = patients_by_cluster.keys
    all_patients.each do |identifier|
      total_patients_sharing_sors << identifier.split('_i').first
    end
    all_cnvs_length = get_cnvs_length(patient_data)

    ###1. Process CNVs
    raw_coverage, n_cnv, nt, pats_per_region = calculate_coverage(sors)
    summary_stats << ['Number of genome windows', n_cnv]
    summary_stats << ['Nucleotides affected by mutations', nt]
    summary_stats << ['Patient average per region', pats_per_region.round(4)]
    summary_stats << ['CNV size average', cnv_size_average.round(4)]
    coverage_to_plot = get_final_coverage(raw_coverage, options[:bin_size])
    write_coverage_data(coverage_to_plot, coverage_to_plot_file)
    system('plot_area.R', '-d', coverage_to_plot_file, '-o', "#{temp_folder}/coverage_plot", '-x', 'V2', '-y', 'V3', '-f', 'V1', '-H', '-m', CHR_SIZE, '-t', 'CNV')

    ###2. Process SORs
    raw_sor_coverage, n_sor, nt, pats_per_region = calculate_coverage(sors, options[:patients_filter] - 1)
    summary_stats << ["Number of patients with at least 1 SOR", total_patients_sharing_sors.uniq.length]
    summary_stats << ["Number of SORs with >= #{options[:patients_filter]} patients", n_sor]
    summary_stats << ['Nucleotides affected by mutations', nt]
    # summary_stats << ['Patient average per region', pats_per_region]
    sor_coverage_to_plot = get_final_coverage(raw_sor_coverage, options[:bin_size])
    write_coverage_data(sor_coverage_to_plot, sor_coverage_to_plot_file)
    system('plot_area.R', '-d', sor_coverage_to_plot_file, '-o', "#{temp_folder}/sor_coverage_plot", '-x', 'V2', '-y', 'V3', '-f', 'V1', '-H', '-m', CHR_SIZE, '-t', 'SOR')
    all_sor_length = get_sor_length_distribution(raw_sor_coverage)
  end
end
489
+ #----------------------------------
490
+ #Report
491
# For each detailed cluster keep [number of patients, phenotypes, percentage of
# the cluster's patients showing each phenotype], keyed by cluster position.
total_patients = 0
phenotypes_frequency = Hash.new(0)
new_cluster_phenotypes = {}
top_cluster_phenotypes.each_with_index do |cluster, clusterID|
  total_patients = cluster.length
  occurrences = Hash.new(0)
  cluster.each do |phenotypes|
    phenotypes.each { |phenotype| occurrences[phenotype] += 1 }
  end
  percentages = occurrences.values.map { |times| times.fdiv(total_patients) * 100 }
  new_cluster_phenotypes[clusterID] = [total_patients, occurrences.keys, percentages]
end
504
+
505
# Data handed to the report template.
container = {
  :temp_folder => temp_folder,
  :summary_stats => summary_stats,
  :hpo_stats => hpo_stats,
  :all_cnvs_length => all_cnvs_length,
  :all_sor_length => all_sor_length,
  :new_cluster_phenotypes => new_cluster_phenotypes.keys.length
}

# One single-row table per detailed cluster, stored under "clust_<position>".
new_cluster_phenotypes.each do |clusterID, info|
  phens = info[1].join(', ')
  freqs = info[2].map { |a| a.round(4) }.join(', ')
  container["clust_#{clusterID}"] = [[info[0], phens, freqs]]
end

# File.read closes the template file; File.open(...).read left it open.
template = File.read(File.join(REPORT_FOLDER, 'cohort_report.erb'))
report = Report_html.new(container, 'Cohort quality report')
report.build(template)
report.write(options[:output_file]+'.html')