pets 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +6 -0
  9. data/bin/area_under_curve_pr.rb +118 -0
  10. data/bin/association_metrics_average.rb +94 -0
  11. data/bin/coPatReporter.rb +531 -0
  12. data/bin/console +14 -0
  13. data/bin/fmeasure_index.rb +72 -0
  14. data/bin/get_PR_values.rb +90 -0
  15. data/bin/get_clusters.R +18 -0
  16. data/bin/get_network_nodes.rb +197 -0
  17. data/bin/lines.R +77 -0
  18. data/bin/merge_by_cluster.rb +62 -0
  19. data/bin/merge_pairs.rb +138 -0
  20. data/bin/paco_translator.rb +102 -0
  21. data/bin/phen2reg.rb +385 -0
  22. data/bin/phen2reg_predictor_check.rb +297 -0
  23. data/bin/plot_area.R +71 -0
  24. data/bin/plot_boxplot.R +21 -0
  25. data/bin/plot_density.R +46 -0
  26. data/bin/plot_scatterplot.R +25 -0
  27. data/bin/reg2phen.rb +116 -0
  28. data/bin/region_to_patients_generator.rb +84 -0
  29. data/bin/relate_CI_to_association_value.rb +90 -0
  30. data/bin/setup +8 -0
  31. data/bin/standardize_scores.R +40 -0
  32. data/bin/xyplot_graph.R +60 -0
  33. data/external_data/biosystems_gene.gz +0 -0
  34. data/external_data/bsid2info.gz +0 -0
  35. data/external_data/chromosome_sizes_hg19.txt +24 -0
  36. data/external_data/gene_data.gz +0 -0
  37. data/external_data/gene_data_with_pathways.gz +0 -0
  38. data/external_data/gene_location.gz +0 -0
  39. data/external_data/hp.obo +146363 -0
  40. data/external_data/remove +0 -0
  41. data/lib/pets.rb +6 -0
  42. data/lib/pets/coPatReporterMethods.rb +77 -0
  43. data/lib/pets/generalMethods.rb +556 -0
  44. data/lib/pets/phen2reg_methods.rb +432 -0
  45. data/lib/pets/version.rb +3 -0
  46. data/pets.gemspec +47 -0
  47. data/templates/cohort_report.erb +93 -0
  48. data/templates/patient_report.erb +209 -0
  49. metadata +183 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a629423c3668446b726dc91d984efe15e730ed48
4
+ data.tar.gz: 11b466ef84cdf9d84354fb54f7c5371fee6fe067
5
+ SHA512:
6
+ metadata.gz: c361cd825328b9265851eef94ae704e2bfc4c88c215b193ee10edb04e2caa93d538c66c7701d09b2ae12c018921f8d579a133705a5ab16e342e5cf961abbd370
7
+ data.tar.gz: 8acfa703181a2787f8e50c36b0dd2eb8704b7f3f10f6b45e2a7777e3a7c144182c94ffad57a919efb51c6c22817fff436c1b549f6eee5e7fc5ea07853f6a5512
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.4.1
7
+ before_install: gem install bundler -v 2.0.1
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in pets.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 elenarojano
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # Gephepred
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/gephepred`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'gephepred'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install gephepred
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/gephepred.
36
+
37
+
38
+ ## License
39
+
40
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
41
+
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,118 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ #Tool for calculating the AUC (area under the curve) of PR (precision-recall) curves.
4
+
5
+ ##########################
6
+ #LIBRARIES
7
+ ##########################
8
+
9
+ require 'optparse'
10
+
11
+ ##########################
12
+ #METHODS
13
+ ##########################
14
+
15
# Read a tab-separated precision-recall table and return its points sorted by x.
#
# Any line containing the substring 'rec' (which also covers 'prec') is treated
# as a header line and skipped.
#
# input_file - path of the tab-separated file to read.
# x_val_col  - 1-based number of the column holding the x values.
# y_val_col  - 1-based number of the column holding the y values.
#
# Returns an Array of [x, y] Float pairs ordered by ascending x. A missing or
# non-numeric field is read as 0.0 (String#to_f / nil.to_f semantics).
def load_file(input_file, x_val_col, y_val_col)
  coordinates = []
  # File.foreach closes the file once iteration ends; the previous
  # File.open(...).each form left the handle open until garbage collection.
  File.foreach(input_file) do |line|
    line.chomp!
    next if line.include?('rec')
    fields = line.split("\t")
    coordinates << [fields[x_val_col - 1].to_f, fields[y_val_col - 1].to_f]
  end
  coordinates.sort { |a, b| a[0] <=> b[0] }
end
28
+
29
+
30
# Compute the area under a curve given as points sorted by ascending x.
#
# The area is the rectangle spanning x = 0 up to the first point (first_x *
# first_y) plus one trapezoid per pair of consecutive points.
#
# The previous implementation added a rectangle of height current_y plus a
# triangle of height |dy|. That only equals the trapezoid when the curve is
# falling; on a rising segment it over-estimated the area. It also used
# integer division by 2, which truncates for Integer inputs.
#
# pr_values - Array of [x, y] numeric pairs, e.g. [[x, y], [x', y'], ...].
#
# Returns the total area as a number (0 for an empty input).
def calculate_auc(pr_values)
  return 0 if pr_values.empty?

  first_x, first_y = pr_values.first
  total_area = first_x * first_y
  pr_values.each_cons(2) do |(prev_x, prev_y), (cur_x, cur_y)|
    # Trapezoid rule: width times the mean of both heights.
    total_area += (cur_x - prev_x).abs * (prev_y + cur_y) / 2.0
  end
  total_area
end
56
+
57
+ # def calculate_auc(pr_values)
58
+ # #pr_values = [[x, y], [x', y']...]
59
+ # counter = 0
60
+ # x_val = 0
61
+ # y_val = 0
62
+ # total_square_area = 0
63
+ # total_triangle_area = 0
64
+ # pr_values.each do |xy_pair|
65
+ # if counter != 0
66
+ # x_prime = xy_pair[0]
67
+ # y_prime = xy_pair[1]
68
+ # #puts "#{x_prime}\t#{x_val}"
69
+ # #puts "#{y_prime}\t#{y_val}"
70
+ # total_square_area += (x_val - x_prime) * y_prime
71
+ # total_triangle_area += (x_val - x_prime) * (y_prime - y_val) / 2
72
+ # x_val = x_prime
73
+ # y_val = y_prime
74
+ # else
75
+ # x_val = xy_pair[0]
76
+ # y_val = xy_pair[1]
77
+ # counter += 1
78
+ # end
79
+ # end
80
+ # total_area = total_square_area + total_triangle_area
81
+ # STDERR.puts total_area
82
+ # return total_area
83
+ # end
84
+
85
+
86
+ ##########################
87
+ #OPT-PARSE
88
+ ##########################
89
+
90
# Command line options. All three are nil until given on the command line.
options = { input_file: nil, x_values: nil, y_values: nil }
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-f", "--input_file PATH", "Precision-recall values file") do |input_file|
    options[:input_file] = input_file
  end

  # The long switches used to be declared as "--:x_values" / "--:y_values"
  # (stray colon). The intended names are added and the old spellings are
  # kept as aliases so existing command lines keep working.
  opts.on("-x", "--x_values INTEGER", "--:x_values INTEGER", "Set column for extracting x values") do |x_values|
    options[:x_values] = x_values.to_i
  end

  opts.on("-y", "--y_values INTEGER", "--:y_values INTEGER", "Set column for extracting y values") do |y_values|
    options[:y_values] = y_values.to_i
  end
end
parser.parse!
110
+
111
+ ##########################
112
+ #MAIN
113
+ ##########################
114
+
115
# All three options are needed: without them load_file would fail with an
# opaque TypeError/NoMethodError, so stop early with a readable message.
%i[input_file x_values y_values].each do |required|
  abort("ERROR: missing mandatory option '#{required}'. Use -f, -x and -y.") if options[required].nil?
end

pr_values = load_file(options[:input_file], options[:x_values], options[:y_values])
final_area = calculate_auc(pr_values)
puts final_area
@@ -0,0 +1,94 @@
1
+ #! /usr/bin/env ruby
2
+ #Tool for calculating averages across different association value files.
3
+ #File structure: prec rec cut meth
4
+ #Load all files (7) stored in the same directory and calculate the average
5
+ #of the lines for each method. Return a file with the same structure,
6
+ #giving the name "average" to the last column.
7
+
8
+ require 'optparse'
9
+
10
+ ##########################
11
+ #METHODS
12
+ ##########################
13
+
14
# Load a tab-separated association file made of one header line plus data rows.
#
# Each data row is split on tabs; its first three fields are converted with
# to_f and the fourth is kept as a String (missing fields become 0.0 / nil).
#
# filename - path of the file to read.
#
# Returns two values: the Array of parsed rows and the header line (String,
# '' when the file is empty).
def load_association_file(filename)
  rows = []
  header = ''
  # File.foreach closes the file when iteration ends; File.open(...).each
  # left the handle open.
  File.foreach(filename).with_index do |line, line_number|
    line.chomp!
    if line_number.zero?
      header = line
    else
      cut, precision, recall, meth = line.split("\t")
      rows << [cut.to_f, precision.to_f, recall.to_f, meth]
    end
  end
  return rows, header
end
30
+
31
# Average the selected columns, row by row, across several loaded files.
#
# The first file is the reference: the result has one row per reference row,
# and every column not listed in cols_for_average is copied from it.
#
# Unlike the previous version, this does not modify its arguments (it used to
# shift the first file out of all_files and overwrite that file's rows in
# place) and it returns [] for an empty file list instead of raising.
#
# all_files        - Array of files, each an Array of rows (Arrays).
# cols_for_average - Array of 0-based column indexes to average.
#
# Returns a new Array of rows holding the averaged values.
def calculate_average(all_files, cols_for_average)
  return [] if all_files.empty?

  n_files = all_files.length.to_f
  reference, *others = all_files
  reference.each_with_index.map do |ref_line, row_number|
    averaged = ref_line.dup
    cols_for_average.each do |col|
      total = others.inject(ref_line[col]) { |sum, file| sum + file[row_number][col] }
      averaged[col] = total / n_files
    end
    averaged
  end
end
53
+
54
+
55
+ ##########################
56
+ #OPT-PARSER
57
+ ##########################
58
+
59
# Command line options: the list of input files and the (0-based) columns to
# average. Both stay nil unless given on the command line.
options = { file_names: nil, which_cols: nil }

parser = OptionParser.new
parser.banner = "Usage: #{__FILE__} [options]"

parser.on("-f", "--file_names STRING", "Input file names to calculate averages. Please separate names by commas") do |names|
  options[:file_names] = names.split(',')
end

# Columns are given 1-based on the command line and stored 0-based.
parser.on("-c", "--which_cols STRING", "Cols for performing average analysis") do |columns|
  options[:which_cols] = columns.split(',').map { |column| column.to_i - 1 }
end

parser.parse!
74
+
75
+ ##########################
76
+ #MAIN
77
+ ##########################
78
+
79
# Load every input file; the header printed at the end is the one read from
# the last file loaded.
header = nil
all_files = options[:file_names].map do |filename|
  rows, header = load_association_file(filename)
  rows
end

average = calculate_average(all_files, options[:which_cols])

puts header
average.each { |row| puts row.join("\t") }
92
+
93
+
94
+
@@ -0,0 +1,531 @@
1
+ #! /usr/bin/env ruby
2
+
3
# Locations resolved relative to this script (bin/).
ROOT_PATH = File.dirname(__FILE__)
REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
# NOTE(review): 'uniq_hpo_with_CI.txt' is not among the external_data files
# listed for this package - confirm it is shipped or always supplied through
# the 'ic_file' environment variable.
IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
CHR_SIZE = File.join(EXTERNAL_DATA, 'chromosome_sizes_hg19.txt')
# The package ships its library files under lib/pets (generalMethods.rb,
# coPatReporterMethods.rb, ...). The load path used to point at
# lib/gephepred, so the requires that follow could not be resolved.
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
10
+
11
+ require 'optparse'
12
+ require 'csv'
13
+ require 'generalMethods.rb'
14
+ require 'coPatReporterMethods.rb'
15
+ require 'report_html'
16
+
17
+ ##########################
18
+ #METHODS
19
+ ##########################
20
# Positions of the fields inside a patient record array:
# HPO terms, chromosome, start coordinate and stop coordinate.
HPOS, CHR, START, STOP = 0, 1, 2, 3
24
+
25
# Normalise every patient record in place and collect cohort-wide HPO data.
#
# For each record the HPO string is split with options[:hpo_separator],
# translated from names to codes when options[:hpo_names] is set, and checked
# with check_hpo_codes. The record then stores the resulting term list at
# HPOS and, when present, integer start/stop coordinates at START/STOP.
#
# patient_data - Hash of patient id => record array (modified in place).
# options      - Hash read for :hpo_separator and :hpo_names.
# name2code_dictionary, hpo_storage, hpo_parent_child_relations - lookup
#   structures handed through to the translation/check helpers.
#
# Returns three values: the unique terms seen in the cohort, a Hash of patient
# id => result of check_hpo_codes, and the unique rejected terms.
def format_patient_data(patient_data, options, name2code_dictionary, hpo_storage, hpo_parent_child_relations)
  cohort_terms = []
  discarded = []
  child_suggestions = {}
  patient_data.each do |pat_id, record|
    raw_profile, _chr, first_pos, last_pos = record
    profile = raw_profile.split(options[:hpo_separator])
    if options[:hpo_names]
      translate_hpo_names2codes(profile, name2code_dictionary, pat_id, discarded)
    end
    child_suggestions[pat_id] = check_hpo_codes(profile, hpo_storage, hpo_parent_child_relations, pat_id, discarded)
    cohort_terms.concat(profile)
    record[HPOS] = profile
    record[START] = first_pos.to_i unless first_pos.nil?
    record[STOP] = last_pos.to_i unless last_pos.nil?
  end
  [cohort_terms.uniq, child_suggestions, discarded.uniq]
end
41
+
42
# Replace, in place, the HPO names of one patient with their codes.
#
# Names missing from the dictionary are reported on STDERR, appended to
# rejected_hpos and dropped from the profile.
#
# hpos           - Array of HPO names (rewritten in place with the codes).
# hpo_dictionary - Hash of HPO name => code.
# pat_id         - patient identifier, used only in the warning text.
# rejected_hpos  - Array that receives every unknown name.
#
# Returns hpos itself.
def translate_hpo_names2codes(hpos, hpo_dictionary, pat_id, rejected_hpos)
  translated = hpos.map do |term_name|
    code = hpo_dictionary[term_name]
    next code unless code.nil?

    STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAME '#{term_name}'. Rejected."
    rejected_hpos << term_name
    nil
  end
  hpos.replace(translated.compact)
end
56
+
57
# Validate the HPO codes of one patient and list their known children.
#
# Every code found in hpo_storage is replaced, in place, by the first field of
# its storage entry (so alternate codes become the main code). Unknown codes
# are reported on STDERR, appended to rejected_hpos and removed from hpos.
#
# hpos                       - Array of HPO codes (modified in place).
# hpo_storage                - Hash of code => [main_code, name, ...].
# hpo_parent_child_relations - Hash of main code => children.
# pat_id                     - patient identifier, used in the warning text.
# rejected_hpos              - Array that receives every unknown code.
#
# Returns an Array with one [[main_code, name], children] entry per accepted
# code; children is [] when the relations hash has no entry for the code.
def check_hpo_codes(hpos, hpo_storage, hpo_parent_child_relations, pat_id, rejected_hpos)
  suggestions = []
  hpos.map! do |code|
    record = hpo_storage[code]
    if record.nil?
      STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODE '#{code}'. Rejected."
      rejected_hpos << code
      next nil
    end

    main_code, term_name = record
    children = hpo_parent_child_relations[main_code]
    children = [] if children.nil?
    suggestions << [[main_code, term_name], children]
    main_code # change from alternate hpo codes to the main ones
  end
  hpos.compact!
  suggestions
end
80
+
81
# Build the binary patient x HPO presence matrix.
#
# Column positions are looked up in a hash built once, instead of calling
# Array#index for every term of every patient (quadratic in cohort size).
# A term that is not part of cohort_hpos is ignored; previously it triggered
# a TypeError through vector[nil] = 1.
#
# patient_data - Hash of patient id => record, with the term list at HPOS.
# cohort_hpos  - Array of HPO terms defining the matrix columns.
#
# Returns an Array of rows (one per patient, in patient_data order); each row
# holds 1 where the patient has the column's term and 0 elsewhere.
def generate_patient_hpo_matrix(patient_data, cohort_hpos)
  column_of = {}
  # ||= keeps the first position of a repeated term, as Array#index did.
  cohort_hpos.each_with_index { |hpo, column| column_of[hpo] ||= column }
  n = cohort_hpos.length
  patient_data.map do |_pat_id, patient_record|
    vector = Array.new(n, 0)
    patient_record[HPOS].each do |hpo|
      column = column_of[hpo]
      vector[column] = 1 unless column.nil?
    end
    vector
  end
end
94
+
95
# Write a matrix as a tab-separated table.
#
# The first line holds the column names only; every following line starts
# with the row name followed by the row values.
#
# matrix  - Array of rows (Arrays).
# x_names - Array of column names.
# y_names - Array of row names, parallel to matrix.
# file    - path of the file to (over)write.
def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |out|
    out.puts(x_names.join("\t"))
    matrix.each_with_index do |row, row_number|
      out.puts(([y_names[row_number]] + row).join("\t"))
    end
  end
end
103
+
104
# Collect, per patient cluster, the profile IC values and chromosome counts.
#
# IC values come from load_hpo_ci_values (file named by ENV['ic_file'], or
# IC_FILE when unset) if options[:ic_stats] is set, otherwise from
# compute_IC_values. Clusters are visited from largest to smallest and
# clusters with a single patient are skipped.
#
# options            - Hash read for :ic_stats, :chromosome_col and
#                      :clusters2show_detailed_phen_data.
# clustered_patients - Hash of cluster id => Array of patient ids.
# patient_data       - Hash of patient id => record (HPOS and CHR are read).
#
# Returns four values:
#   * one Array of profile ICs per processed cluster,
#   * [cluster_id, cluster_size, chromosome, count] rows (only filled when
#     options[:chromosome_col] is set),
#   * the phenotype profiles of the first clusters processed (as many as
#     options[:clusters2show_detailed_phen_data]),
#   * the number of patients belonging to clusters spanning > 1 chromosome.
def process_clustered_patients(options, clustered_patients, patient_data) # get ic and chromosomes
  phenotype_ic =
    if options[:ic_stats]
      load_hpo_ci_values(ENV['ic_file'] || IC_FILE)
    else
      compute_IC_values(patient_data, $patient_number)
    end

  track_chromosomes = !options[:chromosome_col].nil?
  all_ics = []
  top_cluster_phenotypes = []
  cluster_data_by_chromosomes = []
  multi_chromosome_patients = 0
  processed_clusters = 0

  largest_first = clustered_patients.sort_by { |_cl_id, members| members.length }.reverse
  largest_first.each do |cluster_id, patient_ids|
    cluster_size = patient_ids.length
    next if cluster_size == 1

    keep_details = processed_clusters < options[:clusters2show_detailed_phen_data]
    chromosome_counts = Hash.new(0)
    cluster_profiles = []
    cluster_ics = patient_ids.map do |pat_id|
      patient = patient_data[pat_id]
      profile = patient[HPOS]
      ic_value = get_profile_ic(profile, phenotype_ic)
      cluster_profiles << profile if keep_details
      chromosome_counts[patient[CHR]] += 1 if track_chromosomes
      ic_value
    end

    top_cluster_phenotypes << cluster_profiles if keep_details
    all_ics << cluster_ics
    if track_chromosomes
      multi_chromosome_patients += cluster_size if chromosome_counts.length > 1
      chromosome_counts.each do |chr, count|
        cluster_data_by_chromosomes << [cluster_id, cluster_size, chr, count]
      end
    end
    processed_clusters += 1
  end

  [all_ics, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients]
end
145
+
146
# Mean information content of a phenotype profile.
#
# Terms without an IC value add nothing to the sum but still count towards
# the profile length used as divisor. An empty profile divides by 1.
#
# hpo_names    - Array of HPO identifiers.
# phenotype_ic - Hash of HPO identifier => IC value.
#
# Returns the mean as a Float.
def get_profile_ic(hpo_names, phenotype_ic)
  known_values = hpo_names.map { |hpo_id| phenotype_ic[hpo_id] }.compact
  total_ic = known_values.inject(0) { |acc, value| acc + value }
  divisor = hpo_names.length.zero? ? 1 : hpo_names.length
  total_ic.fdiv(divisor)
end
158
+
159
# Write the per-patient IC values of the first clusters as a two-column table.
#
# Each cluster is labelled "<number of values>_<position>"; clusters from
# position `limit` onwards are not written.
#
# all_ics              - Array with one Array of IC values per cluster.
# cluster_ic_data_file - path of the file to (over)write.
# limit                - number of clusters to write.
def write_cluster_ic_data(all_ics, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |out|
    out.puts %w[cluster_id ic].join("\t")
    all_ics.each_with_index do |cluster_ics, position|
      break if position == limit

      label = "#{cluster_ics.length}_#{position}"
      cluster_ics.each { |ic_value| out.puts "#{label}\t#{ic_value}" }
    end
  end
end
171
+
172
# Write the chromosome counts of the first clusters as a three-column table.
#
# Rows of the same cluster are expected to be consecutive: a running rank is
# increased every time the cluster id changes, and writing stops when the
# rank reaches `limit`. Clusters are labelled "<patients>_<rank>".
#
# cluster_data                 - Array of [cluster_id, patients, chr, count].
# cluster_chromosome_data_file - path of the file to (over)write.
# limit                        - number of clusters to write.
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |out|
    out.puts %w[cluster_id chr count].join("\t")
    cluster_rank = 0
    previous_id = cluster_data.empty? ? nil : cluster_data.first.first
    cluster_data.each do |cluster_id, cluster_size, chr, count|
      cluster_rank += 1 if cluster_id != previous_id
      break if cluster_rank == limit

      out.puts ["#{cluster_size}_#{cluster_rank}", chr, count].join("\t")
      previous_id = cluster_id
    end
  end
end
185
+
186
# Write coverage records as tab-separated "chromosome, position, frequency"
# lines (no header).
#
# coverage_to_plot      - Array of [chr, position, freq] records.
# coverage_to_plot_file - path of the file to (over)write.
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |out|
    coverage_to_plot.each do |record|
      chr, position, freq = record
      out.puts format("%s\t%s\t%s", chr, position, freq)
    end
  end
end
193
+
194
# Extract the HPO profile (the element stored at HPOS) of every patient.
#
# patient_data - Hash of patient id => record.
#
# Returns an Array with one profile per patient, in patient_data order. The
# profiles are the same objects stored in the records, not copies.
def get_hpo_profile(patient_data)
  patient_data.map { |_pat_id, record| record[HPOS] }
end
201
+
202
# Build the first rows of the cohort summary table.
#
# Patients are counted as the unique prefixes of the record ids, i.e. the
# part before '_i'. The value reported as 'percentile 90' is found by walking
# the ascending list of profile sizes, starting at its second element, until
# the fraction position / patients exceeds 0.1.
# NOTE(review): confirm that starting at index 1 is intended.
#
# patient_data     - Hash whose keys are the record ids.
# cohort_hpos      - Array of the unique HPO terms of the cohort.
# all_hpo_profiles - Array of HPO profiles (Arrays).
#
# Returns an Array of [label, value] pairs.
def get_summary_stats(patient_data, cohort_hpos, all_hpo_profiles)
  n_pat = patient_data.keys.map { |pat_id| pat_id.split('_i').first }.uniq.length
  profile_sizes = all_hpo_profiles.map { |profile| profile.length }.sort
  total_terms = profile_sizes.inject(0) { |acc, size| acc + size }

  percentile_value = nil
  fraction = 0
  position = 0
  while fraction <= 0.1
    percentile_value = profile_sizes[position + 1]
    fraction = position.fdiv(n_pat)
    position += 1
  end

  [
    ['Unique HPOs', cohort_hpos.length],
    ['Number of patients in the cohort', n_pat],
    ['HPOs per patient (average)', total_terms.fdiv(n_pat).round(4)],
    ['HPOs for patient in percentile 90', percentile_value]
  ]
end
226
+
227
# Percentage of profiles in which each HPO term appears.
#
# all_hpo_profiles - Array of HPO profiles (Arrays of terms).
#
# Returns at most 21 [term, percentage] pairs, sorted by descending
# percentage.
def hpo_stats(all_hpo_profiles)
  occurrences = all_hpo_profiles.each_with_object(Hash.new(0)) do |profile, counts|
    profile.each { |hpo| counts[hpo] += 1 }
  end
  total_profiles = all_hpo_profiles.length
  percentages = occurrences.map { |hpo, count| [hpo, count.fdiv(total_profiles) * 100] }
  percentages.sort { |a, b| b[1] <=> a[1] }.first(21)
end
242
+
243
# Replace, in place, every HPO code of every profile with its name.
#
# The name is the second field of the code's hpo_storage entry. Codes missing
# from hpo_storage are reported on STDERR and left unchanged.
#
# all_hpo_profiles - Array of profiles (Arrays of codes), modified in place.
# hpo_storage      - Hash of code => [main_code, name, ...].
#
# Returns all_hpo_profiles.
def translate_hpo_codes2names(all_hpo_profiles, hpo_storage)
  all_hpo_profiles.each do |profile|
    profile.map! do |hpo|
      record = hpo_storage[hpo]
      if record.nil?
        STDERR.puts "WARNING: hpo code '#{hpo}' not exists."
        hpo
      else
        record[1]
      end
    end
  end
end
255
+
256
# Write, per patient, each current phenotype next to its more specific child
# phenotypes, and append the share of phenotypes having children to the stats.
#
# For every patient the CSV holds a title row (with a warning when the patient
# has fewer than 4 phenotypes), a column-header row, one row per
# phenotype/child pair (the phenotype label is written only on its first row,
# '-' marks a phenotype without children) and an empty separator row.
#
# suggested_childs                 - Hash of patient id => Array of
#                                    [[code, name], children], where children
#                                    is an Array of [code, name] pairs.
# detailed_profile_evaluation_file - path of the CSV file to (over)write.
# summary_stats                    - Array that receives the new
#                                    [label, percentage] pair.
#
# Returns summary_stats.
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
  evaluated_terms = 0
  terms_with_children = 0
  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
    suggested_childs.each do |pat_id, suggestions|
      notice = suggestions.length < 4 ? 'WARNING: Very few phenotypes' : ''
      csv << ["PATIENT #{pat_id}", notice]
      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
      suggestions.each do |(parent_code, parent_name), childs|
        evaluated_terms += 1
        parent_label = "#{parent_name} (#{parent_code})"
        if childs.empty?
          csv << [parent_label, '-']
          next
        end

        terms_with_children += 1
        childs.each_with_index do |(child_code, child_name), child_number|
          first_column = child_number.zero? ? parent_label : ""
          csv << [first_column, "#{child_name} (#{child_code})"]
        end
      end
      csv << ["", ""]
    end
  end
  summary_stats << ['Percentage of defined HPOs that have more specific childs', (terms_with_children.fdiv(evaluated_terms) * 100).round(4)]
end
289
+
290
+ ##########################
291
+ #OPT-PARSER
292
+ ##########################
293
+
294
# Command line options with their defaults.
options = {
  coverage_analysis: true,
  bin_size: 50000,
  clusters2show_detailed_phen_data: 3,
  chromosome_col: nil,
  pat_id_col: nil,
  excluded_hpo: nil,
  end_col: nil,
  patients_filter: 2,
  clusters2graph: 30,
  header: true,
  input_file: nil,
  hpo_names: false,
  output_file: nil,
  hpo_file: nil,
  hpo_col: nil,
  hpo_separator: '|',
  start_col: nil,
  ic_stats: false
}

# Help texts were corrected where they contradicted the code: -a and -H switch
# their feature OFF, and the -f default is 2 (it was documented as 0).
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--coverage_analysis", "Deactivate genome coverage analysis (performed by default)") do
    options[:coverage_analysis] = false
  end

  opts.on("-b", "--bin_size INTEGER", "Maximum number of bins to plot the coverage. Default 50000") do |data|
    options[:bin_size] = data.to_i
  end

  opts.on("-C", "--clusters2show INTEGER", "How many patient clusters are shown in detailed phenotype cluster data section. Default 3") do |data|
    options[:clusters2show_detailed_phen_data] = data.to_i
  end

  opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
    options[:chromosome_col] = data
  end

  opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
    options[:pat_id_col] = data
  end

  opts.on("-E", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
    options[:excluded_hpo] = excluded_hpo
  end

  opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
    options[:end_col] = data
  end

  opts.on("-f", "--patients_filter INTEGER", "Minimum number of patients sharing SORs. Default 2") do |data|
    options[:patients_filter] = data.to_i
  end

  opts.on("-g", "--clusters2graph INTEGER", "How many patient clusters are plotted in cluster plots. Default 30") do |data|
    options[:clusters2graph] = data.to_i
  end

  #chr\tstart\tstop
  opts.on("-H", "--header", "Set if the input file has NO header line (a header is expected by default)") do
    options[:header] = false
  end

  opts.on("-i", "--input_file PATH", "Input file with patient data") do |data|
    options[:input_file] = data
  end

  opts.on("-n", "--hpo_names", "Define if the input HPO are human readable names. Default false") do
    options[:hpo_names] = true
  end

  opts.on("-o", "--output_file PATH", "Output file with patient data") do |data|
    options[:output_file] = data
  end

  opts.on("-P", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value|
    options[:hpo_file] = value
  end

  opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
    options[:hpo_col] = data
  end

  opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data|
    options[:hpo_separator] = data
  end

  opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
    options[:start_col] = data
  end

  opts.on("-t", "--ic_stats", "Use internal IC stats. Default false") do
    options[:ic_stats] = true
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
parser.parse!
395
+
396
+
397
+ ##########################
398
+ #MAIN
399
+ ##########################
400
# Every intermediate file is written to a 'temp' folder created next to the
# requested output file.
output_folder = File.dirname(options[:output_file])
detailed_profile_evaluation_file = File.join(output_folder, 'detailed_hpo_profile_evaluation.csv')
temp_folder = File.join(output_folder, 'temp')
matrix_file = File.join(temp_folder, 'pat_hpo_matrix.txt')
clustered_patients_file = File.join(temp_folder, 'cluster_asignation')
cluster_ic_data_file = File.join(temp_folder, 'cluster_ic_data.txt')
cluster_chromosome_data_file = File.join(temp_folder, 'cluster_chromosome_data.txt')
coverage_to_plot_file = File.join(temp_folder, 'coverage_data.txt')
sor_coverage_to_plot_file = File.join(temp_folder, 'sor_coverage_data.txt')
# File.exists? was deprecated and then removed in Ruby 3.2; File.exist? is the
# supported spelling and behaves the same.
Dir.mkdir(temp_folder) unless File.exist?(temp_folder)
411
+
412
+ # LOAD HPO DATA
413
+ #-------------------------
414
+
415
+ # #load hpo dictionaries
416
hpo_black_list = options[:excluded_hpo].nil? ? [] : load_hpo_black_list(options[:excluded_hpo])
# The -P/--hpo_file option used to be parsed but never read. It now takes
# precedence over the 'hpo_file' environment variable and the bundled file.
hpo_file = options[:hpo_file] || ENV['hpo_file'] || HPO_FILE
hpo_storage = load_hpo_file(hpo_file, hpo_black_list)
hpo_parent_child_relations = get_child_parent_relations(hpo_storage)
name2code_dictionary = create_hpo_dictionary(hpo_storage) if options[:hpo_names]

patient_data, $patient_number = load_patient_cohort(options)
cohort_hpos, suggested_childs, rejected_hpos = format_patient_data(patient_data, options, name2code_dictionary, hpo_storage, hpo_parent_child_relations)
pat_hpo_matrix = generate_patient_hpo_matrix(patient_data, cohort_hpos)
write_matrix_for_R(pat_hpo_matrix, cohort_hpos, patient_data.keys, matrix_file)

# External commands get their arguments as a list, so paths containing spaces
# or shell metacharacters are passed through untouched (no shell is involved).
# An existing cluster assignment file is reused instead of re-clustering.
# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
system('get_clusters.R', matrix_file, temp_folder) unless File.exist?(clustered_patients_file)
clustered_patients = load_clustered_patients(clustered_patients_file)
all_ics, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients = process_clustered_patients(options, clustered_patients, patient_data)
write_cluster_ic_data(all_ics, cluster_ic_data_file, options[:clusters2graph])
system('plot_boxplot.R', cluster_ic_data_file, temp_folder, 'cluster_id', 'ic', 'Cluster size/id', 'Information coefficient')
all_hpo_profiles = get_hpo_profile(patient_data)
# NOTE: the profiles are the arrays stored in patient_data, so this also turns
# the codes kept in the patient records into names.
translate_hpo_codes2names(all_hpo_profiles, hpo_storage)
summary_stats = get_summary_stats(patient_data, cohort_hpos, all_hpo_profiles)
write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
hpo_stats = hpo_stats(all_hpo_profiles)
summary_stats << ['Number of unknown phenotypes', rejected_hpos.length]
440
+
441
# Chromosome-based statistics and coverage plots; they need the chromosome
# column, and the coverage part can be switched off with -a.
all_cnvs_length = []
if !options[:chromosome_col].nil?
  # NOTE(review): the label says "clusters" but the value counts the patients
  # that belong to clusters spanning more than one chromosome - confirm.
  summary_stats << ['Number of clusters with mutations accross > 1 chromosomes', multi_chromosome_patients]
  write_cluster_chromosome_data(cluster_data_by_chromosomes, cluster_chromosome_data_file, options[:clusters2graph])
  # Arguments are passed as a list so no shell quoting is needed.
  system('plot_scatterplot.R', cluster_chromosome_data_file, temp_folder, 'cluster_id', 'chr', 'count', 'Cluster size/id', 'Chromosome', 'Patients')

  #----------------------------------
  #Prepare data to plot coverage
  if options[:coverage_analysis]
    processed_patient_data = process_patient_data(patient_data)
    cnv_sizes = []
    processed_patient_data.each do |_chr, metadata|
      metadata.each do |_patient_id, start, stop|
        cnv_sizes << stop - start
      end
    end
    # An empty list used to end in nil.fdiv (NoMethodError); report 0.0 instead.
    cnv_size_average = cnv_sizes.empty? ? 0.0 : cnv_sizes.inject { |sum, el| sum + el }.fdiv(cnv_sizes.length)
    patients_by_cluster, sors = generate_cluster_regions(processed_patient_data, 'A', 0)
    total_patients_sharing_sors = patients_by_cluster.keys.map { |identifier| identifier.split('_i').first }
    all_cnvs_length = get_cnvs_length(patient_data)

    ###1. Process CNVs
    raw_coverage, n_cnv, nt, pats_per_region = calculate_coverage(sors)
    summary_stats << ['Number of genome windows', n_cnv]
    summary_stats << ['Nucleotides affected by mutations', nt]
    summary_stats << ['Patient average per region', pats_per_region.round(4)]
    summary_stats << ['CNV size average', cnv_size_average.round(4)]
    coverage_to_plot = get_final_coverage(raw_coverage, options[:bin_size])
    write_coverage_data(coverage_to_plot, coverage_to_plot_file)
    system('plot_area.R', '-d', coverage_to_plot_file, '-o', "#{temp_folder}/coverage_plot",
           '-x', 'V2', '-y', 'V3', '-f', 'V1', '-H', '-m', CHR_SIZE, '-t', 'CNV')

    ###2. Process SORs
    raw_sor_coverage, n_sor, nt, pats_per_region = calculate_coverage(sors, options[:patients_filter] - 1)
    summary_stats << ["Number of patients with at least 1 SOR", total_patients_sharing_sors.uniq.length]
    summary_stats << ["Number of SORs with >= #{options[:patients_filter]} patients", n_sor]
    summary_stats << ['Nucleotides affected by mutations', nt]
    sor_coverage_to_plot = get_final_coverage(raw_sor_coverage, options[:bin_size])
    write_coverage_data(sor_coverage_to_plot, sor_coverage_to_plot_file)
    system('plot_area.R', '-d', sor_coverage_to_plot_file, '-o', "#{temp_folder}/sor_coverage_plot",
           '-x', 'V2', '-y', 'V3', '-f', 'V1', '-H', '-m', CHR_SIZE, '-t', 'SOR')
    # Stays nil when this branch is skipped; the report container reads it.
    all_sor_length = get_sor_length_distribution(raw_sor_coverage)
  end
end
489
+ #----------------------------------
490
+ #Report
491
# For each detailed cluster: number of patients, the phenotypes seen and the
# percentage of the cluster's patients showing each phenotype.
new_cluster_phenotypes = {}
top_cluster_phenotypes.each_with_index do |cluster, clusterID|
  total_patients = cluster.length
  phenotypes_frequency = Hash.new(0)
  cluster.each do |phenotypes|
    phenotypes.each { |phenotype| phenotypes_frequency[phenotype] += 1 }
  end
  percentages = phenotypes_frequency.values.map { |v| v.fdiv(total_patients) * 100 }
  new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, percentages]
end

# Data handed to the report template.
container = {
  :temp_folder => temp_folder,
  :summary_stats => summary_stats,
  :hpo_stats => hpo_stats,
  :all_cnvs_length => all_cnvs_length,
  :all_sor_length => all_sor_length,
  :new_cluster_phenotypes => new_cluster_phenotypes.keys.length
}

# One single-row table per detailed cluster, stored under "clust_<index>".
new_cluster_phenotypes.each do |clusterID, info|
  phens = info[1].join(', ')
  freqs = info[2].map { |a| a.round(4) }.join(', ')
  container["clust_#{clusterID}"] = [[info[0], phens, freqs]]
end

# File.read closes the template file; File.open(...).read left it open.
template = File.read(File.join(REPORT_FOLDER, 'cohort_report.erb'))
report = Report_html.new(container, 'Cohort quality report')
report.build(template)
report.write(options[:output_file]+'.html')