pets 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +6 -0
  9. data/bin/area_under_curve_pr.rb +118 -0
  10. data/bin/association_metrics_average.rb +94 -0
  11. data/bin/coPatReporter.rb +531 -0
  12. data/bin/console +14 -0
  13. data/bin/fmeasure_index.rb +72 -0
  14. data/bin/get_PR_values.rb +90 -0
  15. data/bin/get_clusters.R +18 -0
  16. data/bin/get_network_nodes.rb +197 -0
  17. data/bin/lines.R +77 -0
  18. data/bin/merge_by_cluster.rb +62 -0
  19. data/bin/merge_pairs.rb +138 -0
  20. data/bin/paco_translator.rb +102 -0
  21. data/bin/phen2reg.rb +385 -0
  22. data/bin/phen2reg_predictor_check.rb +297 -0
  23. data/bin/plot_area.R +71 -0
  24. data/bin/plot_boxplot.R +21 -0
  25. data/bin/plot_density.R +46 -0
  26. data/bin/plot_scatterplot.R +25 -0
  27. data/bin/reg2phen.rb +116 -0
  28. data/bin/region_to_patients_generator.rb +84 -0
  29. data/bin/relate_CI_to_association_value.rb +90 -0
  30. data/bin/setup +8 -0
  31. data/bin/standardize_scores.R +40 -0
  32. data/bin/xyplot_graph.R +60 -0
  33. data/external_data/biosystems_gene.gz +0 -0
  34. data/external_data/bsid2info.gz +0 -0
  35. data/external_data/chromosome_sizes_hg19.txt +24 -0
  36. data/external_data/gene_data.gz +0 -0
  37. data/external_data/gene_data_with_pathways.gz +0 -0
  38. data/external_data/gene_location.gz +0 -0
  39. data/external_data/hp.obo +146363 -0
  40. data/external_data/remove +0 -0
  41. data/lib/pets.rb +6 -0
  42. data/lib/pets/coPatReporterMethods.rb +77 -0
  43. data/lib/pets/generalMethods.rb +556 -0
  44. data/lib/pets/phen2reg_methods.rb +432 -0
  45. data/lib/pets/version.rb +3 -0
  46. data/pets.gemspec +47 -0
  47. data/templates/cohort_report.erb +93 -0
  48. data/templates/patient_report.erb +209 -0
  49. metadata +183 -0
@@ -0,0 +1,62 @@
1
+ #! /usr/bin/env ruby
2
+ #Tool to create the training file, taking as input the cluster_coords.txt file and phenotype_mutations_relations.txt
3
+
4
+ ##########################
5
+ #RUBY GEMS
6
+ ##########################
7
+ require 'optparse'
8
+
9
+ ##########################
10
+ #METHODS
11
+ ##########################
12
+
13
#Load the cluster coordinates table.
#Each line of cluster_file holds four tab-separated columns, read in this
#order: start, stop, chromosome, node (cluster) identifier.
#Returns a hash: node identifier => [chr, start, stop] (all values are strings).
def load_cluster_file(cluster_file)
  clusters_info = {}
  # File.foreach closes the file once the block finishes; File.open(...).each left it open
  File.foreach(cluster_file) do |line|
    start, stop, chr, node = line.chomp.split("\t")
    clusters_info[node] = [chr, start, stop]
  end
  clusters_info
end
22
+
23
#Print the training table to standard output.
#relations_file: tab-separated file with columns hpo, node, score.
#clusters: hash node => [chr, start, stop], as built by load_cluster_file.
#filter: relations whose absolute score is <= filter are dropped.
#Every kept relation is printed as: chr, start, stop, hpo, score, node.
def obtain_training(relations_file, clusters, filter)
  # File.foreach closes the file once the block finishes
  File.foreach(relations_file) do |line|
    hpo, node, score = line.chomp.split("\t")
    next if score.to_f.abs <= filter
    cluster_coords = clusters[node]
    if cluster_coords.nil?
      # Without coordinates the record cannot be written; report it instead of failing on nil
      warn "Warning: node '#{node}' is not present in the cluster file, relation skipped"
      next
    end
    puts "#{cluster_coords.join("\t")}\t#{hpo}\t#{score}\t#{node}"
  end
end
32
+
33
+ ##########################
34
+ #OPT-PARSE
35
+ ##########################
36
#Command line options; the defaults are listed first and overwritten by the flags below
options = {
  cluster_file: nil,
  relations_file: nil,
  filter_association: 0
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-c", "--cluster_file PATH", "Input file with patient clusters") do |cluster_path|
    options[:cluster_file] = cluster_path
  end

  opts.on("-n", "--relations_file PATH", "Input relations file from tripartite network") do |relations_file|
    options[:relations_file] = relations_file
  end

  # The value is converted with to_f, so the help placeholder now reads FLOAT.
  # The flag spelling is left untouched so existing command lines keep working.
  opts.on("-f", "--filter_minimun FLOAT", "Filter for association values") do |filter_association|
    options[:filter_association] = filter_association.to_f
  end
end.parse!
57
+
58
+ ##########################
59
+ #MAIN
60
+ ##########################
61
#Both input files are mandatory: stop with a readable message instead of the
#TypeError raised when a nil path reaches File
[[:cluster_file, '--cluster_file'], [:relations_file, '--relations_file']].each do |option_key, flag|
  abort("ERROR: missing required option #{flag}") if options[option_key].nil?
end
clusters = load_cluster_file(options[:cluster_file])
obtain_training(options[:relations_file], clusters, options[:filter_association])
@@ -0,0 +1,138 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+
5
+ #################################
6
+ ## METHODS
7
+ #################################
8
## Read a tab-separated pairs file and split its records in two groups.
## A line whose first column matches key (used as a regular expression) goes to
## the first group, any other line to the second one. In both groups the last
## column is the hash key and the first column is appended to its value list.
## Returns the two hashes: [pairsA, pairsB].
def load_pairs(file, key)
  pairsA = {}
  pairsB = {}
  key_pattern = /#{key}/ # built once instead of once per line
  # File.foreach closes the file once the block finishes
  File.foreach(file) do |line|
    fields = line.chomp.split("\t")
    target = fields.first =~ key_pattern ? pairsA : pairsB
    save_record(target, fields.last, fields.first)
  end
  return pairsA, pairsB
end
22
+
23
## Append val to the list stored under key, creating the list on first use.
## Returns the list that now contains val.
def save_record(hash, key, val)
  hash[key] = [] if hash[key].nil?
  hash[key] << val
end
31
+
32
## Open n_files output files for writing, named "<output>1.txt" .. "<output>N.txt".
## Returns the array of open File objects; the caller is responsible for closing them.
def generate_files(n_files, output)
  files = []
  begin
    n_files.times do |n|
      files << File.open("#{output}#{n + 1}.txt", 'w')
    end
  rescue StandardError
    # Do not leave the handles opened so far dangling when a later open fails
    files.each(&:close)
    raise
  end
  files
end
39
+
40
## For every key present in both hashes, write one line per (value of A, value of B)
## combination as "valueA<TAB>key<TAB>valueB". Each line goes to a file picked at
## random among the first n_files entries of files.
def connect_pairs_write(pairsA, pairsB, n_files, files)
  pairsA.each do |shared_key, group_a|
    group_b = pairsB[shared_key]
    next if group_b.nil?
    group_a.product(group_b).each do |member_a, member_b|
      files[rand(n_files)].puts "#{member_a}\t#{shared_key}\t#{member_b}"
    end
  end
end
52
+
53
## Build the relations table between the members of both groups.
## For every key present in both hashes, each (value of A, value of B) combination
## yields the relation name "valueA_valueB"; the returned hash maps that name to
## the list of keys connecting the two values.
## NOTE(review): the name is ambiguous when a value itself contains '_' - confirm
## that the identifiers used never do.
def get_relations(pairsA, pairsB)
  relations = {}
  pairsA.each do |shared_key, group_a|
    group_b = pairsB[shared_key]
    next if group_b.nil?
    group_a.product(group_b).each do |member_a, member_b|
      rel_key = member_a + '_' + member_b
      relations[rel_key] = [] if relations[rel_key].nil?
      relations[rel_key] << shared_key
    end
  end
  relations
end
73
+
74
+ ##############################
75
+ #OPTPARSE
76
+ ##############################
77
+
78
## Command line options; the defaults are listed first and overwritten by the flags below
options = {
  input_file: nil,
  key: '',
  output: 'tri_',
  n_files: 10,
  min_connections: 1
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-i", "--input_file PATH", "Input file for create adjacency matrix") do |input_file|
    options[:input_file] = input_file
  end

  # Help text typo fixed ("th" -> "the")
  opts.on("-k", "--key STRING", "String to split the two groups") do |key|
    options[:key] = key
  end

  opts.on("-o", "--output PATH", "Output network pairs") do |output|
    options[:output] = output
  end

  opts.on("-n", "--n_files INTEGER", "Split data onto n files") do |n|
    options[:n_files] = n.to_i
  end

  # Help text typo fixed ("Minimun" -> "Minimum")
  opts.on("-m", "--min_connections INTEGER", "Minimum connections to take into account a relation") do |n|
    options[:min_connections] = n.to_i
  end
end.parse!
108
+
109
+ ################################
110
+ ## MAIN
111
+ ################################
112
## The output files are opened first; the ensure clause guarantees they are
## closed (and flushed) even when loading or writing fails half-way.
files = generate_files(options[:n_files], options[:output])
begin
  pairsA, pairsB = load_pairs(options[:input_file], options[:key])
  if options[:min_connections] == 1
    connect_pairs_write(pairsA, pairsB, options[:n_files], files)
  else
    STDERR.puts "MIN. NUMBER OF CONNECTIONS = #{options[:min_connections]}"
    relations = get_relations(pairsA, pairsB)
    count = 0
    discarded = 0
    relations.each do |rel, connections|
      if connections.length >= options[:min_connections]
        # NOTE(review): the relation name is split on '_', so only its first and
        # last '_'-separated pieces are written - verify identifiers never contain '_'
        fields = rel.split('_')
        connections.each do |con|
          files[rand(options[:n_files])].puts "#{fields.first}\t#{con}\t#{fields.last}"
        end
      else
        discarded += connections.length
      end
      count += connections.length
    end
    STDERR.puts "Relations: #{count}"
    STDERR.puts "Discarded: #{discarded}"
  end
ensure
  files.each(&:close)
end
@@ -0,0 +1,102 @@
1
+ #! /usr/bin/env ruby
2
+
3
#Locations resolved relative to this script
ROOT_PATH = File.dirname(__FILE__)
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
#The library files of this gem live in lib/pets, so that folder must be on the
#load path for the requires below to resolve. The former lib/gephepred entry is
#kept for checkouts that still use the old folder name.
%w[gephepred pets].each do |lib_folder|
  $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', lib_folder))
end
7
+
8
+ require 'generalMethods.rb'
9
+ require 'optparse'
10
+
11
+ ###############
12
+ #METHODS
13
+ ###############
14
+
15
#Replace the HPO codes of every patient by their names.
#patient_data: hash patientID => array whose first element is a '|'-separated
#string of HPO codes; the remaining elements are kept as they are.
#hpo_storage: hash HPO code => record whose element at index 1 is used as the name.
#A code that is missing from hpo_storage is kept untranslated instead of
#aborting the whole run.
#The arrays of patient_data are modified in place (codes removed from the front,
#names appended at the end) and returned in a new hash keyed by patientID.
def translate_codes_to_terms(patient_data, hpo_storage)
  patients_with_hpo_names = {}
  patient_data.each do |patientID, hpos_and_cnvs|
    hpo_names = hpos_and_cnvs.shift.split('|').map do |hpo|
      hpo_record = hpo_storage[hpo]
      hpo_record.nil? ? hpo : hpo_record[1]
    end
    hpos_and_cnvs << hpo_names.join('|')
    patients_with_hpo_names[patientID] = hpos_and_cnvs
  end
  patients_with_hpo_names
end
29
+
30
#Write one tab-separated line per patient to output_file: the patient id
#followed by its data fields. The first "_i<digit>" found in each id is removed
#before writing.
def save_translated_file(patients_with_hpo_names, output_file)
  # The block form closes the file even if writing raises
  File.open(output_file, 'w') do |handler|
    patients_with_hpo_names.each do |id, data|
      patientID = id.gsub(/_i[0-9]/,'')
      handler.puts "#{patientID}\t#{data.join("\t")}"
    end
  end
end
38
+
39
+ ###############
40
+ #OPTIONS
41
+ ###############
42
+
43
#Command line options; the defaults are listed first and overwritten by the flags below
options = {
  chromosome_col: nil,
  pat_id_col: nil,
  end_col: nil,
  header: true,
  output_file: 'paco_file_with_hpo_names.txt',
  input_file: nil,
  hpo_col: nil,
  start_col: nil
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
    options[:chromosome_col] = data
  end

  opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
    options[:pat_id_col] = data
  end

  opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
    options[:end_col] = data
  end

  # The flag switches the header OFF; the former help text stated the opposite
  opts.on("-H", "--header", "Set if the file has no header line. By default a header line is expected") do
    options[:header] = false
  end

  opts.on("-o", "--output_file PATH", "Output paco file with HPO names") do |data|
    options[:output_file] = data
  end

  opts.on("-P", "--input_file PATH", "Input file with PACO extension") do |value|
    options[:input_file] = value
  end

  opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
    options[:hpo_col] = data
  end

  opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
    options[:start_col] = data
  end
end.parse!
89
+
90
+
91
+ ###############
92
+ #MAIN
93
+ ###############
94
+
95
#Load the HPO table and the patient cohort, translate the codes and save the result
hpo_storage = load_hpo_file(HPO_FILE)
cohort = load_patient_cohort(options)
patient_data, $patient_number = cohort
save_translated_file(translate_codes_to_terms(patient_data, hpo_storage), options[:output_file])

exit
data/bin/phen2reg.rb ADDED
@@ -0,0 +1,385 @@
1
+ #! /usr/bin/env ruby
2
+ # Rojano E. & Seoane P., September 2016
3
+ # Program to predict the position from given HPO codes, sorted by their association values.
4
+
5
#Locations resolved relative to this script
REPORT_FOLDER = File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
ROOT_PATH = File.dirname(__FILE__)
#The library files of this gem live in lib/pets, so that folder must be on the
#load path for the requires below to resolve. The former lib/gephepred entry is
#kept for checkouts that still use the old folder name.
%w[gephepred pets].each do |lib_folder|
  $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', lib_folder))
end
8
+ require 'net/ftp'
9
+ require 'net/http'
10
+ require 'zlib'
11
+ require 'json'
12
+ require 'generalMethods.rb'
13
+ require 'phen2reg_methods.rb'
14
+ require 'optparse'
15
+ require 'report_html'
16
+
17
+
18
+ ##########################
19
+ #METHODS
20
+ ##########################
21
+
22
#Compute, for every predicted region, the percentage of the patient phenotypes
#found in the region HPO list, record it and drop the regions below the minimum.
#adjacent_regions_joined: array of [chr, start, stop, hpo_list, association_values, score];
#  filtered IN PLACE.
#patient_original_phenotypes: array with the HPOs of the profile.
#predicted_hpo_percentage: hash patient_number => list of percentages; one value
#  is appended per region, including the regions that are dropped.
#min_hpo_recovery_percentage: regions with a lower percentage are removed.
#An empty phenotype list counts as 0% recovery (the plain division gave NaN,
#which is never below the minimum).
def calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number)
  total_phenotypes = patient_original_phenotypes.length
  adjacent_regions_joined.delete_if do |_chr, _start, _stop, hpo_list, _association_values, _score|
    hpo_coincidences = patient_original_phenotypes & hpo_list
    recovery_percentage = total_phenotypes.zero? ? 0.0 : hpo_coincidences.length / total_phenotypes.to_f * 100
    (predicted_hpo_percentage[patient_number] ||= []) << recovery_percentage
    recovery_percentage < min_hpo_recovery_percentage
  end
end
41
+
42
#Download a file through anonymous FTP.
#ftp_server: host name; path: remote file path; name: local file to write.
#The connection is always closed. When the transfer fails, a local file created
#by this call is removed so that callers which only download when the file is
#absent do not pick up a truncated copy; the error is raised again.
def download(ftp_server, path, name)
  existed_before = File.exist?(name)
  ftp = Net::FTP.new()
  begin
    ftp.connect(ftp_server)
    ftp.login
    ftp.getbinaryfile(path, name)
  rescue StandardError
    File.delete(name) if !existed_before && File.exist?(name)
    raise
  ensure
    ftp.close
  end
end
49
+
50
+ ##########################
51
+ #OPT-PARSER
52
+ ##########################
53
+
54
#Command line options; the defaults are listed first and overwritten by the flags below
options = {
  best_thresold: 1.5,
  freedom_degree: 'prednum',
  html_file: "patient_profile_report.html",
  hpo_file: nil,
  information_coefficient: nil,
  retrieve_kegg_data: false,
  print_matrix: false,
  max_number: 10,
  hpo_is_name: false,
  output_quality_control: "output_quality_control.txt",
  output_matrix: 'output_matrix.txt',
  prediction_data: nil, #chr\tstart\tstop
  pvalue_cutoff: 0.1,
  quality_control: true,
  ranking_style: '',
  write_hpo_recovery_file: true,
  group_by_region: true,
  html_reporting: true,
  training_file: nil, #chr\tstart\tstop\tphenotype\tassociation_value
  multiple_profile: false,
  hpo_recovery: 50
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"
  opts.on("-b", "--best_thresold FLOAT", "Association value thresold") { |value| options[:best_thresold] = value.to_f }
  opts.on("-d", "--freedom_degree STRING", "Type of freedom degree calculation: prednum, phennum, maxnum") { |value| options[:freedom_degree] = value }
  opts.on("-F", "--html_file PATH", "HTML file with patient information HPO profile summary") { |value| options[:html_file] = value }
  opts.on("-f", "--hpo_file PATH", "Input hp.obo file") { |value| options[:hpo_file] = value }
  opts.on("-i", "--information_coefficient PATH", "Input file with information coefficients") { |value| options[:information_coefficient] = value }
  opts.on('-k', "--retrieve_kegg_data", "Add KEGG data to prediction report") { options[:retrieve_kegg_data] = true }
  opts.on('-m', "--print_matrix", "Print output matrix") { options[:print_matrix] = true }
  opts.on("-M", "--max_number INTEGER", "Max number of regions to take into account") { |value| options[:max_number] = value.to_i }
  opts.on("-n", "--hpo_is_name", "Set this flag if phenotypes are given as names instead of codes") { options[:hpo_is_name] = true }
  opts.on("-O", "--output_quality_control PATH", "Output file with quality control of all input HPOs") { |value| options[:output_quality_control] = value }
  opts.on("-o", "--output_matrix PATH", "Output matrix file, with associations for each input HPO") { |value| options[:output_matrix] = value }
  opts.on("-p", "--prediction_file PATH", "Input data with HPO codes for predicting their location. It can be either, a file path or string with HPO separated by pipes (|)") { |value| options[:prediction_data] = value }
  opts.on("-P", "--pvalue_cutoff FLOAT", "P-value cutoff") { |value| options[:pvalue_cutoff] = value.to_f }
  opts.on("-Q", "--no_quality_control", "Disable quality control") { options[:quality_control] = false }
  opts.on("-r", "--ranking_style STRING", "Ranking style: mean, fisher, geommean") { |value| options[:ranking_style] = value }
  opts.on("-s", "--write_hpo_recovery_file", "Disable write hpo recovery file") { options[:write_hpo_recovery_file] = false }
  opts.on("-S", "--group_by_region", "Disable prediction which HPOs are located in the same region") { options[:group_by_region] = false }
  opts.on("-T", "--no_html_reporting", "Disable html reporting") { options[:html_reporting] = false }
  opts.on("-t", "--training_file PATH", "Input training file, with association values") { |value| options[:training_file] = value }
  opts.on("-u", "--multiple_profile", "Set if multiple profiles") { options[:multiple_profile] = true }
  opts.on("-y", "--hpo_recovery INTEGER", "Minimum percentage of HPO terms to consider predictions") { |value| options[:hpo_recovery] = value.to_f }
end.parse!
165
+
166
+ ##########################
167
+ #PATHS
168
+ ##########################
169
#Table with the folders and data files used by the script, relative to its location
code_folder = File.join(File.dirname(__FILE__), '..')
external_data_folder = File.join(code_folder, 'external_data')
all_paths = { code: code_folder, external_data: external_data_folder }
{
  gene_data: 'gene_data.gz',
  biosystems_gene: 'biosystems_gene.gz',
  biosystems_info: 'bsid2info.gz',
  gene_data_with_pathways: 'gene_data_with_pathways.gz',
  gene_location: 'gene_location.gz'
}.each do |path_key, file_name|
  all_paths[path_key] = File.join(external_data_folder, file_name)
end
176
+
177
+ ##########################
178
+ #DOWNLOADS
179
+ ##########################
180
#Remote files needed by the script: [ftp server, remote path, local destination]
sources = [
  ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
]
#Only the files that are not available locally are downloaded.
#File.exist? is used because File.exists? no longer exists in Ruby 3.2 and later.
sources.each do |server, path, output|
  download(server, path, output) unless File.exist?(output)
end
188
+
189
+ ##########################
190
+ #MAIN
191
+ ##########################
192
+
193
#Turn options[:prediction_data] into an array of profiles (each one an array of HPOs).
#The option is either a file path or the phenotypes themselves.
#Without it nothing can be predicted: stop with a readable message instead of the
#TypeError raised when nil reaches File.exist?
abort('ERROR: no prediction data given, use -p/--prediction_file') if options[:prediction_data].nil?
if File.exist?(options[:prediction_data])
  # File.readlines closes the file after reading it
  profile_lines = File.readlines(options[:prediction_data]).map { |line| line.chomp }
  if !options[:multiple_profile]
    # Single profile: one phenotype per line
    options[:prediction_data] = [profile_lines]
  else
    # Several profiles: one per line, phenotypes separated by pipes
    options[:prediction_data] = profile_lines.map { |line| line.split('|') }
  end
else
  # if you want to add phenotypes through the terminal
  if !options[:multiple_profile]
    options[:prediction_data] = [options[:prediction_data].split('|')]
  else
    # Profiles are separated by '!' and the phenotypes of each profile by '|'
    options[:prediction_data] = options[:prediction_data].split('!').map{|profile| profile.split('|')}
  end
end
213
+
214
+ ##########################
215
+ #- Loading data
216
+
217
#HPO table, plus the data needed by the quality control when it is enabled
hpo_storage = load_hpo_file(options[:hpo_file])
if options[:quality_control]
  hpo_child_metadata = get_child_parent_relations(hpo_storage)
  hpos_ci_values = load_hpo_ci_values(options[:information_coefficient])
end

#Gene data is only needed for the KEGG report. It is computed once and cached in
#two compressed files that are read back when both of them are present.
genes_with_kegg = {}
gene_location = {}
if options[:retrieve_kegg_data]
  # File.exist? is used because File.exists? no longer exists in Ruby 3.2 and later
  if !File.exist?(all_paths[:gene_data_with_pathways]) || !File.exist?(all_paths[:gene_location])
    gene_list, gene_location = load_gene_data(all_paths[:gene_data])
    ### kegg_data = parse_kegg_data(genes_found_attributes.keys)
    kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
    genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
    write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
    write_compressed_plain_file(gene_location, all_paths[:gene_location])
  else
    gene_location = read_compressed_json(all_paths[:gene_location])
    genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
  end
end

# hpo_dictionary = load_hpo_dictionary_name2code(options[:hpo2name_file]) if options[:hpo_is_name]
trainingData = load_training_file4HPO(options[:training_file], options[:best_thresold])
241
+
242
+ ##########################
243
+ #- HPO PROFILE ANALYSIS
244
+
245
#Analyse every HPO profile: optional name->code translation, optional quality
#control, region prediction, optional KEGG annotation and optional html report.
phenotypes_by_patient = {}
predicted_hpo_percentage = {}
options[:prediction_data].each_with_index do |patient_hpo_profile, patient_number|
  # The same array object is stored: the in-place translation below is visible through it
  phenotypes_by_patient[patient_number] = patient_hpo_profile
  if options[:hpo_is_name]
    hpo_dictionary = create_hpo_dictionary(hpo_storage)
    # Names without a code become nil and are removed from the profile
    patient_hpo_profile.map! { |name| hpo_dictionary[name] }
    patient_hpo_profile.compact!
  end

  #HPO quality control
  #---------------------------
  characterised_hpos = []
  if options[:quality_control]
    characterised_hpos = hpo_quality_control(patient_hpo_profile, hpo_storage, hpo_child_metadata, hpos_ci_values)
    header = ["HPO name", "HPO code", "Exists?", "CI value", "Is child of", "Childs"]
    # The block form closes the file even if building the table raises
    File.open(options[:output_quality_control], "w") do |output_quality_control|
      output_quality_control.puts Terminal::Table.new :headings => header, :rows => characterised_hpos
    end
  end

  #Prediction steps
  #---------------------------
  hpo_regions = search4HPO(patient_hpo_profile, trainingData)
  if hpo_regions.empty?
    puts "ProfID:#{patient_number}\tResults not found"
  elsif options[:group_by_region] == false
    hpo_regions.each do |hpo, regions|
      regions.each do |region|
        puts "ProfID:#{patient_number}\t#{hpo}\t#{region.join("\t")}"
      end
    end
  elsif options[:group_by_region] == true
    region2hpo, regionAttributes, association_scores = group_by_region(hpo_regions)
    #add_parentals_of_not_found_hpos_in_regions(patient_hpo_profile, trainingData, region2hpo, regionAttributes, association_scores, hpo_metadata)
    null_value = 0
    hpo_region_matrix = generate_hpo_region_matrix(region2hpo, association_scores, patient_hpo_profile, null_value)
    if options[:print_matrix]
      File.open(options[:output_matrix] + "_#{patient_number}", "w") do |output_matrix|
        output_matrix.puts "Region\t#{patient_hpo_profile.join("\t")}"
        regionAttributes_array = regionAttributes.values
        hpo_region_matrix.each_with_index do |association_values, i|
          chr, start, stop = regionAttributes_array[i]
          output_matrix.puts "#{chr}:#{start}-#{stop}\t#{association_values.join("\t")}"
        end
      end
    end

    scoring_regions(regionAttributes, hpo_region_matrix, options[:ranking_style], options[:pvalue_cutoff], options[:freedom_degree], null_value)
    if regionAttributes.empty?
      puts "ProfID:#{patient_number}\tResults not found"
    else
      adjacent_regions_joined = []
      regionAttributes.each do |regionID, attributes|
        chr, start, stop, _patient_ID, _region_length, score = attributes
        association_values = association_scores[regionID]
        adjacent_regions_joined << [chr, start, stop, association_values.keys, association_values.values, score]
      end
      adjacent_regions_joined = join_regions(adjacent_regions_joined) # TODO: move this before building the matrix

      #Ranking
      if options[:ranking_style] == 'fisher'
        # Ascending order of the score for the fisher style
        adjacent_regions_joined.sort!{|r1, r2| r1.last <=> r2.last}
      else
        # Descending order of the score for every other style
        adjacent_regions_joined.sort!{|r1, r2| r2.last <=> r1.last}
      end
      patient_original_phenotypes = phenotypes_by_patient[patient_number]
      calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, options[:hpo_recovery], patient_number)
      if adjacent_regions_joined.empty?
        puts "ProfID:#{patient_number}\tResults not found"
      else
        adjacent_regions_joined = adjacent_regions_joined[0..options[:max_number]-1] if !options[:max_number].nil?
        adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score|
          puts "ProfID:#{patient_number}\t#{chr}\t#{start}\t#{stop}\t#{hpo_list.join(',')}\t#{association_values.join(',')}\t#{score}"
        end
      end
    end
  end #elsif

  pathway_stats = {}
  if options[:retrieve_kegg_data]
    genes_found = []
    # adjacent_regions_joined is still nil when no region was predicted for this
    # profile (or regions are not grouped), so an empty list is used in that case
    (adjacent_regions_joined || []).each do |adjacent_region|
      ref_chr, ref_start, ref_stop = adjacent_region
      # A chromosome without gene records simply contributes no genes
      chr_genes = gene_location[ref_chr] || []
      genes = []
      chr_genes.each do |gene_name, gene_start, gene_stop|
        # NOTE(review): every comparison is strict, so a region sharing a
        # boundary coordinate with a gene may not be reported - confirm intent
        if (ref_start > gene_start && ref_stop < gene_stop) ||
          (ref_start < gene_start && ref_stop > gene_stop) ||
          (ref_start < gene_start && ref_stop > gene_start) ||
          (ref_start < gene_stop && ref_stop > gene_stop)
          genes << gene_name
        end
      end
      genes_found << genes
    end

    # One list per region with [gene, KEGG data stored for the gene] pairs
    genes_with_kegg_data = genes_found.map do |genes|
      genes.map { |gene| [gene, genes_with_kegg[gene]] }
    end
    pathway_stats = compute_pathway_enrichment(genes_with_kegg_data, genes_with_kegg)
    pathway_stats.sort!{|p1, p2| p1.last <=> p2.last}
  end

  #Creating html report
  #-------------------
  ####PLEASE CHECK THIS METHOD!
  # genes_with_kegg_data is nil here unless KEGG data was requested
  report_data(characterised_hpos, adjacent_regions_joined, options[:html_file], hpo_storage, genes_with_kegg_data, pathway_stats) if options[:html_reporting]
end # end each_with_index
376
+
377
#Save the recovery percentages: one line per predicted region of every profile
if options[:write_hpo_recovery_file]
  # The block form closes the file even if writing raises
  File.open('output_profile_recovery', 'w') do |handler|
    predicted_hpo_percentage.each do |patient, percentage|
      percentage.each do |perc|
        handler.puts "ProfID:#{patient}\t#{perc.inspect}"
      end
    end
  end
end