pets 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +6 -0
  9. data/bin/area_under_curve_pr.rb +118 -0
  10. data/bin/association_metrics_average.rb +94 -0
  11. data/bin/coPatReporter.rb +531 -0
  12. data/bin/console +14 -0
  13. data/bin/fmeasure_index.rb +72 -0
  14. data/bin/get_PR_values.rb +90 -0
  15. data/bin/get_clusters.R +18 -0
  16. data/bin/get_network_nodes.rb +197 -0
  17. data/bin/lines.R +77 -0
  18. data/bin/merge_by_cluster.rb +62 -0
  19. data/bin/merge_pairs.rb +138 -0
  20. data/bin/paco_translator.rb +102 -0
  21. data/bin/phen2reg.rb +385 -0
  22. data/bin/phen2reg_predictor_check.rb +297 -0
  23. data/bin/plot_area.R +71 -0
  24. data/bin/plot_boxplot.R +21 -0
  25. data/bin/plot_density.R +46 -0
  26. data/bin/plot_scatterplot.R +25 -0
  27. data/bin/reg2phen.rb +116 -0
  28. data/bin/region_to_patients_generator.rb +84 -0
  29. data/bin/relate_CI_to_association_value.rb +90 -0
  30. data/bin/setup +8 -0
  31. data/bin/standardize_scores.R +40 -0
  32. data/bin/xyplot_graph.R +60 -0
  33. data/external_data/biosystems_gene.gz +0 -0
  34. data/external_data/bsid2info.gz +0 -0
  35. data/external_data/chromosome_sizes_hg19.txt +24 -0
  36. data/external_data/gene_data.gz +0 -0
  37. data/external_data/gene_data_with_pathways.gz +0 -0
  38. data/external_data/gene_location.gz +0 -0
  39. data/external_data/hp.obo +146363 -0
  40. data/external_data/remove +0 -0
  41. data/lib/pets.rb +6 -0
  42. data/lib/pets/coPatReporterMethods.rb +77 -0
  43. data/lib/pets/generalMethods.rb +556 -0
  44. data/lib/pets/phen2reg_methods.rb +432 -0
  45. data/lib/pets/version.rb +3 -0
  46. data/pets.gemspec +47 -0
  47. data/templates/cohort_report.erb +93 -0
  48. data/templates/patient_report.erb +209 -0
  49. metadata +183 -0
@@ -0,0 +1,62 @@
1
+ #! /usr/bin/env ruby
2
+ #Tool to create the training file, taking as input the cluster_coords.txt file and phenotype_mutations_relations.txt
3
+
4
+ ##########################
5
+ #RUBY GEMS
6
+ ##########################
7
+ require 'optparse'
8
+
9
+ ##########################
10
+ #METHODS
11
+ ##########################
12
+
13
# Reads a tab-separated cluster file into a lookup table keyed by node id.
#
# Each line is split on tabs and interpreted, in order, as:
# start, stop, chromosome, node id.
#
# cluster_file - path of the cluster coordinates file.
#
# Returns a Hash mapping node id => [chr, start, stop]; every value is kept
# as the String read from the file.
def load_cluster_file(cluster_file)
  clusters_info = {}
  # File.foreach closes the file once iteration ends; the previous
  # File.open(...).each form left the handle open.
  File.foreach(cluster_file) do |line|
    start, stop, chr, node = line.chomp.split("\t")
    clusters_info[node] = [chr, start, stop]
  end
  clusters_info
end
22
+
23
# Prints one training record per relation whose score passes the filter.
#
# Each line of relations_file is split on tabs as: hpo, node, score.
# Relations with |score| <= filter are dropped. For the rest, the node is
# looked up in clusters and a line
#   <cluster fields>\t<hpo>\t<score>\t<node>
# is written to standard output.
#
# relations_file - path of the tab-separated relations file.
# clusters       - Hash node id => Array of cluster fields
#                  (as built by load_cluster_file).
# filter         - numeric threshold applied to the absolute score.
def obtain_training(relations_file, clusters, filter)
  # File.foreach closes the file when done (File.open(...).each did not).
  File.foreach(relations_file) do |line|
    hpo, node, score = line.chomp.split("\t")
    next if score.to_f.abs <= filter

    cluster_fields = clusters[node]
    if cluster_fields.nil?
      # Previously this crashed with NoMethodError on nil.join; report the
      # inconsistent record and carry on with the remaining relations.
      warn "WARNING: node '#{node}' is not present in the cluster file; relation skipped"
      next
    end
    puts "#{cluster_fields.join("\t")}\t#{hpo}\t#{score}\t#{node}"
  end
end
32
+
33
+ ##########################
34
+ #OPT-PARSE
35
+ ##########################
36
# Command-line options. Defaults are set up front so every key exists even
# when the corresponding switch is not given.
options = {
  cluster_file: nil,
  relations_file: nil,
  filter_association: 0
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-c", "--cluster_file PATH", "Input file with patient clusters") do |cluster_path|
    options[:cluster_file] = cluster_path
  end

  opts.on("-n", "--relations_file PATH", "Input relations file from tripartite network") do |relations_file|
    options[:relations_file] = relations_file
  end

  # The threshold is compared against fractional association scores, so it is
  # advertised and parsed as a float. The Float coercion makes OptionParser
  # reject non-numeric input instead of silently turning it into 0.0.
  opts.on("-f", "--filter_minimun FLOAT", Float, "Filter for association values") do |filter_association|
    options[:filter_association] = filter_association
  end
end.parse!
57
+
58
+ ##########################
59
+ #MAIN
60
+ ##########################
61
# Both input files are mandatory. Without this check a missing switch ends
# in a TypeError raised while trying to open a nil path.
if options[:cluster_file].nil? || options[:relations_file].nil?
  abort("ERROR: both --cluster_file and --relations_file must be provided")
end

# Build the node => coordinates table, then print the filtered training records.
clusters = load_cluster_file(options[:cluster_file])
obtain_training(options[:relations_file], clusters, options[:filter_association])
@@ -0,0 +1,138 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+
5
+ #################################
6
+ ## METHODS
7
+ #################################
8
+ def load_pairs(file, key)
9
+ pairsA = {}
10
+ pairsB = {}
11
+ File.open(file).each do |line|
12
+ line.chomp!
13
+ fields = line.split("\t")
14
+ if fields.first =~ /#{key}/#.include?(key)
15
+ save_record(pairsA, fields.last, fields.first )
16
+ else
17
+ save_record(pairsB, fields.last, fields.first )
18
+ end
19
+ end
20
+ return pairsA, pairsB
21
+ end
22
+
23
+ def save_record(hash, key, val)
24
+ query = hash[key]
25
+ if query.nil?
26
+ hash[key] = [val]
27
+ else
28
+ query << val
29
+ end
30
+ end
31
+
32
+ def generate_files(n_files, output)
33
+ files = []
34
+ n_files.times do |n|
35
+ files << File.open("#{output}#{n+1}.txt", 'w')
36
+ end
37
+ return files
38
+ end
39
+
40
+ def connect_pairs_write(pairsA, pairsB, n_files, files)
41
+ pairsA.each do |keyA, valA|
42
+ valB = pairsB[keyA]
43
+ if !valB.nil?
44
+ valA.each do |vA|
45
+ valB.each do |vB|
46
+ files[rand(n_files)].puts "#{vA}\t#{keyA}\t#{vB}"
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
52
+
53
+ def get_relations(pairsA, pairsB)
54
+ relations = {}
55
+ pairsA.each do |keyA, valA|
56
+ valB = pairsB[keyA]
57
+ if !valB.nil?
58
+ valA.each do |vA|
59
+ valB.each do |vB|
60
+ rel_key = vA + '_' + vB
61
+ query = relations[rel_key]
62
+ if query.nil?
63
+ relations[rel_key] = [keyA]
64
+ else
65
+ query << keyA
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
71
+ return relations
72
+ end
73
+
74
+ ##############################
75
+ #OPTPARSE
76
+ ##############################
77
+
78
+ options = {}
79
+ OptionParser.new do |opts|
80
+ opts.banner = "Usage: #{__FILE__} [options]"
81
+
82
+ options[:input_file] = nil
83
+ opts.on("-i", "--input_file PATH", "Input file for create adjacency matrix") do |input_file|
84
+ options[:input_file] = input_file
85
+ end
86
+
87
+ options[:key] = ''
88
+ opts.on("-k", "--key STRING", "String to split th two groups") do |key|
89
+ options[:key] = key
90
+ end
91
+
92
+ options[:output] = 'tri_'
93
+ opts.on("-o", "--output PATH", "Output network pairs") do |output|
94
+ options[:output] = output
95
+ end
96
+
97
+ options[:n_files] = 10
98
+ opts.on("-n", "--n_files INTEGER", "Split data onto n files") do |n|
99
+ options[:n_files] = n.to_i
100
+ end
101
+
102
+ options[:min_connections] = 1
103
+ opts.on("-m", "--min_connections INTEGER", "Minimun connections to take into account a relation") do |n|
104
+ options[:min_connections] = n.to_i
105
+ end
106
+
107
+ end.parse!
108
+
109
+ ################################
110
+ ## MAIN
111
+ ################################
112
+ files = generate_files(options[:n_files], options[:output])
113
+
114
+ pairsA, pairsB = load_pairs(options[:input_file], options[:key])
115
+ if options[:min_connections] == 1
116
+ connect_pairs_write(pairsA, pairsB, options[:n_files], files)
117
+ else
118
+ STDERR.puts "MIN. NUMBER OF CONNECTIONS = #{options[:min_connections]}"
119
+ relations = get_relations(pairsA, pairsB)
120
+ count = 0
121
+ discarded = 0
122
+ relations.each do |rel, connections|
123
+ if connections.length >= options[:min_connections]
124
+ fields = rel.split('_')
125
+ connections.each do |con|
126
+ files[rand(options[:n_files])].puts "#{fields.first}\t#{con}\t#{fields.last}"
127
+ end
128
+ else
129
+ discarded += connections.length
130
+ end
131
+ count += connections.length
132
+ end
133
+ STDERR.puts "Relations: #{count}"
134
+ STDERR.puts "Discarded: #{discarded}"
135
+ end
136
+ files.each do |f|
137
+ f.close
138
+ end
@@ -0,0 +1,102 @@
1
+ #! /usr/bin/env ruby
2
+
3
# Locations resolved relative to this script (bin/).
ROOT_PATH = File.dirname(__FILE__)
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
# The helper libraries ship under lib/pets (generalMethods.rb and friends);
# the former lib/gephepred entry pointed at a directory that is not part of
# the package, so `require 'generalMethods.rb'` could not be resolved.
$LOAD_PATH << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
7
+
8
+ require 'generalMethods.rb'
9
+ require 'optparse'
10
+
11
+ ###############
12
+ #METHODS
13
+ ###############
14
+
15
# Replaces each patient's pipe-separated HPO code list with the matching
# list of values looked up in hpo_storage.
#
# The first element of every patient record (the "code1|code2|..." string) is
# removed from the record, each code is replaced by hpo_storage[code][1]
# (presumably the term name - confirm against load_hpo_file), and the joined
# result is appended at the end of the same record. Records are modified in
# place. A code absent from hpo_storage is not handled specially here.
#
# patient_data - Hash patient id => Array whose first element is the
#                pipe-separated code string.
# hpo_storage  - lookup indexed by HPO code; entry [1] is used.
#
# Returns a Hash patient id => modified record.
def translate_codes_to_terms(patient_data, hpo_storage)
  patient_data.each_with_object({}) do |(patientID, hpos_and_cnvs), translated|
    hpo_codes = hpos_and_cnvs.shift.split('|')
    hpo_names = hpo_codes.map { |code| hpo_storage[code][1] }
    hpos_and_cnvs << hpo_names.join('|')
    translated[patientID] = hpos_and_cnvs
  end
end
29
+
30
# Writes one tab-separated line per patient: the patient id followed by the
# fields of its record.
#
# Before writing, every "_i<digit>" fragment is removed from the id.
#
# patients_with_hpo_names - Hash patient id => Array of fields.
# output_file             - path of the file to (over)write.
def save_translated_file(patients_with_hpo_names, output_file)
  # Block form guarantees the file is flushed and closed even if a write fails.
  File.open(output_file, 'w') do |handler|
    patients_with_hpo_names.each do |id, data|
      patientID = id.gsub(/_i[0-9]/, '')
      handler.puts "#{patientID}\t#{data.join("\t")}"
    end
  end
end
38
+
39
+ ###############
40
+ #OPTIONS
41
+ ###############
42
+
43
# Command-line options. Column options accept either a column name or a
# 0-based position; they are stored as given (Strings) and interpreted by the
# cohort loader.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:chromosome_col] = nil
  opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
    options[:chromosome_col] = data
  end

  options[:pat_id_col] = nil
  opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
    options[:pat_id_col] = data
  end

  options[:end_col] = nil
  opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
    options[:end_col] = data
  end

  # A header line is expected by default and the switch turns that off. The
  # previous help text ("Set if the file has a line header") described the
  # opposite of what the switch does.
  options[:header] = true
  opts.on("-H", "--header", "Set if the file has NO header line. By default a header line is expected") do
    options[:header] = false
  end

  options[:output_file] = 'paco_file_with_hpo_names.txt'
  opts.on("-o", "--output_file PATH", "Output paco file with HPO names") do |data|
    options[:output_file] = data
  end

  options[:input_file] = nil
  opts.on("-P", "--input_file PATH", "Input file with PACO extension") do |value|
    options[:input_file] = value
  end

  options[:hpo_col] = nil
  opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
    options[:hpo_col] = data
  end

  options[:start_col] = nil
  opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
    options[:start_col] = data
  end
end.parse!
89
+
90
+
91
+ ###############
92
+ #MAIN
93
+ ###############
94
+
95
# Load the HPO data and the patient cohort, translate the HPO codes of every
# patient and write the translated cohort to the output file.
hpo_storage = load_hpo_file(HPO_FILE)
patient_data, $patient_number = load_patient_cohort(options)

save_translated_file(
  translate_codes_to_terms(patient_data, hpo_storage),
  options[:output_file]
)

exit
data/bin/phen2reg.rb ADDED
@@ -0,0 +1,385 @@
1
+ #! /usr/bin/env ruby
2
+ # Rojano E. & Seoane P., September 2016
3
+ # Program to predict the position from given HPO codes, sorted by their association values.
4
+
5
# Locations resolved relative to this script (bin/).
REPORT_FOLDER = File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
ROOT_PATH = File.dirname(__FILE__)
# The helper libraries ship under lib/pets (generalMethods.rb,
# phen2reg_methods.rb); the former lib/gephepred entry pointed at a directory
# that is not part of the package, so the requires below could not resolve.
$LOAD_PATH << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
8
+ require 'net/ftp'
9
+ require 'net/http'
10
+ require 'zlib'
11
+ require 'json'
12
+ require 'generalMethods.rb'
13
+ require 'phen2reg_methods.rb'
14
+ require 'optparse'
15
+ require 'report_html'
16
+
17
+
18
+ ##########################
19
+ #METHODS
20
+ ##########################
21
+
22
# Scores every predicted region by the share of the patient's phenotypes it
# recovers, records that share, and drops regions below the minimum.
#
# For each region the recovery percentage is
#   |patient phenotypes & region HPO list| / |patient phenotypes| * 100
# Every percentage (kept or dropped region alike) is appended to
# predicted_hpo_percentage[patient_number]. Regions whose percentage is below
# min_hpo_recovery_percentage are removed from adjacent_regions_joined in place.
#
# adjacent_regions_joined      - Array of region records; element 3 of each
#                                record is its HPO list. Modified in place.
# patient_original_phenotypes  - Array of the patient's HPO terms.
# predicted_hpo_percentage     - Hash patient number => Array of percentages,
#                                updated in place.
# min_hpo_recovery_percentage  - numeric cut-off (percentage).
# patient_number               - key used in predicted_hpo_percentage.
#
# Returns the Array of positions that were removed.
def calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number)
  records_to_delete = []
  adjacent_regions_joined.each_with_index do |region, position|
    hpo_list = region[3]
    recovered = (patient_original_phenotypes & hpo_list).length
    recovery_percentage = recovered / patient_original_phenotypes.length.to_f * 100
    records_to_delete << position if recovery_percentage < min_hpo_recovery_percentage
    (predicted_hpo_percentage[patient_number] ||= []) << recovery_percentage
  end
  # Delete from the highest position down so earlier positions stay valid.
  records_to_delete.reverse_each { |position| adjacent_regions_joined.delete_at(position) }
end
41
+
42
# Downloads a single file over FTP in binary mode.
#
# ftp_server - host name of the FTP server.
# path       - remote path of the file.
# name       - local path the file is written to.
#
# Logs in through Net::FTP#login without arguments. Any network or FTP error
# propagates to the caller.
def download(ftp_server, path, name)
  ftp = Net::FTP.new()
  begin
    ftp.connect(ftp_server)
    ftp.login
    ftp.getbinaryfile(path, name)
  ensure
    # Release the connection even when login or the transfer fails.
    ftp.close
  end
end
49
+
50
+ ##########################
51
+ #OPT-PARSER
52
+ ##########################
53
+
54
# Command-line options, starting from their defaults.
options = {
  best_thresold: 1.5,
  freedom_degree: 'prednum',
  html_file: "patient_profile_report.html",
  hpo_file: nil,
  information_coefficient: nil,
  retrieve_kegg_data: false,
  print_matrix: false,
  max_number: 10,
  hpo_is_name: false,
  output_quality_control: "output_quality_control.txt",
  output_matrix: 'output_matrix.txt',
  prediction_data: nil, # profile(s) to predict, see -p
  pvalue_cutoff: 0.1,
  quality_control: true,
  ranking_style: '',
  write_hpo_recovery_file: true,
  group_by_region: true,
  html_reporting: true,
  training_file: nil, # columns: chr\tstart\tstop\tphenotype\tassociation_value
  multiple_profile: false,
  hpo_recovery: 50
}

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-b", "--best_thresold FLOAT", "Association value thresold") { |value| options[:best_thresold] = value.to_f }

  opts.on("-d", "--freedom_degree STRING", "Type of freedom degree calculation: prednum, phennum, maxnum") { |value| options[:freedom_degree] = value }

  opts.on("-F", "--html_file PATH", "HTML file with patient information HPO profile summary") { |value| options[:html_file] = value }

  opts.on("-f", "--hpo_file PATH", "Input hp.obo file") { |value| options[:hpo_file] = value }

  opts.on("-i", "--information_coefficient PATH", "Input file with information coefficients") { |value| options[:information_coefficient] = value }

  opts.on('-k', "--retrieve_kegg_data", "Add KEGG data to prediction report") { options[:retrieve_kegg_data] = true }

  opts.on('-m', "--print_matrix", "Print output matrix") { options[:print_matrix] = true }

  opts.on("-M", "--max_number INTEGER", "Max number of regions to take into account") { |value| options[:max_number] = value.to_i }

  opts.on("-n", "--hpo_is_name", "Set this flag if phenotypes are given as names instead of codes") { options[:hpo_is_name] = true }

  opts.on("-O", "--output_quality_control PATH", "Output file with quality control of all input HPOs") { |value| options[:output_quality_control] = value }

  opts.on("-o", "--output_matrix PATH", "Output matrix file, with associations for each input HPO") { |value| options[:output_matrix] = value }

  opts.on("-p", "--prediction_file PATH", "Input data with HPO codes for predicting their location. It can be either, a file path or string with HPO separated by pipes (|)") { |value| options[:prediction_data] = value }

  opts.on("-P", "--pvalue_cutoff FLOAT", "P-value cutoff") { |value| options[:pvalue_cutoff] = value.to_f }

  opts.on("-Q", "--no_quality_control", "Disable quality control") { options[:quality_control] = false }

  opts.on("-r", "--ranking_style STRING", "Ranking style: mean, fisher, geommean") { |value| options[:ranking_style] = value }

  # Despite their names, the next two switches turn the feature OFF.
  opts.on("-s", "--write_hpo_recovery_file", "Disable write hpo recovery file") { options[:write_hpo_recovery_file] = false }

  opts.on("-S", "--group_by_region", "Disable prediction which HPOs are located in the same region") { options[:group_by_region] = false }

  opts.on("-T", "--no_html_reporting", "Disable html reporting") { options[:html_reporting] = false }

  opts.on("-t", "--training_file PATH", "Input training file, with association values") { |value| options[:training_file] = value }

  opts.on("-u", "--multiple_profile", "Set if multiple profiles") { options[:multiple_profile] = true }

  opts.on("-y", "--hpo_recovery INTEGER", "Minimum percentage of HPO terms to consider predictions") { |value| options[:hpo_recovery] = value.to_f }
end
option_parser.parse!
165
+
166
+ ##########################
167
+ #PATHS
168
+ ##########################
169
# Table of the package directories and bundled data files used below, all
# resolved relative to this script.
code_dir = File.join(File.dirname(__FILE__), '..')
external_data_dir = File.join(code_dir, 'external_data')
all_paths = { code: code_dir, external_data: external_data_dir }
{
  gene_data: 'gene_data.gz',
  biosystems_gene: 'biosystems_gene.gz',
  biosystems_info: 'bsid2info.gz',
  gene_data_with_pathways: 'gene_data_with_pathways.gz',
  gene_location: 'gene_location.gz'
}.each do |path_key, file_name|
  all_paths[path_key] = File.join(external_data_dir, file_name)
end
176
+
177
+ ##########################
178
+ #DOWNLOADS
179
+ ##########################
180
# Remote files the predictor depends on: [FTP server, remote path, local path].
sources = [
  ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
]
# Fetch only what is not already on disk.
# File.exists? (with trailing "s") was removed in Ruby 3.2; File.exist? works
# on every Ruby version.
sources.each do |server, path, output|
  download(server, path, output) unless File.exist?(output)
end
188
+
189
+ ##########################
190
+ #MAIN
191
+ ##########################
192
+
193
# Normalise the -p argument into an Array of profiles, each profile being an
# Array of HPO terms. The argument is either a file path or a literal string.
prediction_input = options[:prediction_data]
# Without this check a missing -p ended in a TypeError from File.exist?(nil).
abort("ERROR: prediction data must be given with -p/--prediction_file") if prediction_input.nil?

if File.exist?(prediction_input)
  # File.readlines closes the file once read (File.open(...) without a block did not).
  profile_lines = File.readlines(prediction_input).map { |line| line.chomp }
  if options[:multiple_profile]
    # One profile per line, HPO terms separated by '|'.
    options[:prediction_data] = profile_lines.map { |line| line.split('|') }
  else
    # Single profile: one HPO term per line.
    options[:prediction_data] = [profile_lines]
  end
else
  # Phenotypes given directly on the command line.
  if options[:multiple_profile]
    # Profiles separated by '!', HPO terms within a profile by '|'.
    options[:prediction_data] = prediction_input.split('!').map { |profile| profile.split('|') }
  else
    options[:prediction_data] = [prediction_input.split('|')]
  end
end
213
+
214
+ ##########################
215
+ #- Loading data
216
+
217
hpo_storage = load_hpo_file(options[:hpo_file])
# Extra HPO metadata is only needed by the quality-control step.
if options[:quality_control]
  hpo_child_metadata = get_child_parent_relations(hpo_storage)
  hpos_ci_values = load_hpo_ci_values(options[:information_coefficient])
end

genes_with_kegg = {}
gene_location = {}
if options[:retrieve_kegg_data]
  # File.exists? (with trailing "s") was removed in Ruby 3.2; use File.exist?.
  if File.exist?(all_paths[:gene_data_with_pathways]) && File.exist?(all_paths[:gene_location])
    # Reuse the tables cached by a previous run.
    gene_location = read_compressed_json(all_paths[:gene_location])
    genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
  else
    # Build the gene tables from the raw data and cache them for later runs.
    gene_list, gene_location = load_gene_data(all_paths[:gene_data])
    ### kegg_data = parse_kegg_data(genes_found_attributes.keys)
    kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
    genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
    write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
    write_compressed_plain_file(gene_location, all_paths[:gene_location])
  end
end

# hpo_dictionary = load_hpo_dictionary_name2code(options[:hpo2name_file]) if options[:hpo_is_name]
trainingData = load_training_file4HPO(options[:training_file], options[:best_thresold])
241
+
242
+ ##########################
243
+ #- HPO PROFILE ANALYSIS
244
+
245
# Runs the whole prediction pipeline once per HPO profile: optional name->code
# translation, optional quality control, region prediction/ranking, optional
# KEGG annotation and optional HTML report.
phenotypes_by_patient = {}
predicted_hpo_percentage = {}
options[:prediction_data].each_with_index do |patient_hpo_profile, patient_number|
  phenotypes_by_patient[patient_number] = patient_hpo_profile
  # Both stay nil unless the corresponding step below produces a result.
  adjacent_regions_joined = nil
  genes_with_kegg_data = nil

  if options[:hpo_is_name]
    hpo_dictionary = create_hpo_dictionary(hpo_storage)
    # Translate in place; names missing from the dictionary become nil and
    # are dropped.
    patient_hpo_profile.map! { |name| hpo_dictionary[name] }
    patient_hpo_profile.compact!
  end

  #HPO quality control
  #---------------------------
  characterised_hpos = []
  if options[:quality_control]
    characterised_hpos = hpo_quality_control(patient_hpo_profile, hpo_storage, hpo_child_metadata, hpos_ci_values)
    header = ["HPO name", "HPO code", "Exists?", "CI value", "Is child of", "Childs"]
    # Block form closes the file even if rendering the table fails.
    File.open(options[:output_quality_control], "w") do |output_quality_control|
      output_quality_control.puts Terminal::Table.new(:headings => header, :rows => characterised_hpos)
    end
  end

  #Prediction steps
  #---------------------------
  hpo_regions = search4HPO(patient_hpo_profile, trainingData)
  if hpo_regions.empty?
    puts "ProfID:#{patient_number}\tResults not found"
  elsif options[:group_by_region] == false
    hpo_regions.each do |hpo, regions|
      regions.each do |region|
        puts "ProfID:#{patient_number}\t#{hpo}\t#{region.join("\t")}"
      end
    end
  elsif options[:group_by_region] == true
    region2hpo, regionAttributes, association_scores = group_by_region(hpo_regions)
    null_value = 0
    hpo_region_matrix = generate_hpo_region_matrix(region2hpo, association_scores, patient_hpo_profile, null_value)
    if options[:print_matrix]
      regionAttributes_array = regionAttributes.values
      File.open(options[:output_matrix] + "_#{patient_number}", "w") do |output_matrix|
        output_matrix.puts "Region\t#{patient_hpo_profile.join("\t")}"
        hpo_region_matrix.each_with_index do |association_values, i|
          chr, start, stop = regionAttributes_array[i]
          output_matrix.puts "#{chr}:#{start}-#{stop}\t#{association_values.join("\t")}"
        end
      end
    end

    scoring_regions(regionAttributes, hpo_region_matrix, options[:ranking_style], options[:pvalue_cutoff], options[:freedom_degree], null_value)
    if regionAttributes.empty?
      puts "ProfID:#{patient_number}\tResults not found"
    else
      adjacent_regions_joined = []
      regionAttributes.each do |regionID, attributes|
        chr, start, stop, patient_ID, region_length, score = attributes
        association_values = association_scores[regionID]
        adjacent_regions_joined << [chr, start, stop, association_values.keys, association_values.values, score]
      end
      # TODO: move this step to before the matrix is built
      adjacent_regions_joined = join_regions(adjacent_regions_joined)

      #Ranking
      if options[:ranking_style] == 'fisher'
        # fisher: ascending by score (last field of each record)
        adjacent_regions_joined.sort!{|r1, r2| r1.last <=> r2.last}
      else
        # any other style: descending by score
        adjacent_regions_joined.sort!{|r1, r2| r2.last <=> r1.last}
      end
      patient_original_phenotypes = phenotypes_by_patient[patient_number]
      calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, options[:hpo_recovery], patient_number)
      if adjacent_regions_joined.empty?
        puts "ProfID:#{patient_number}\tResults not found"
      else
        adjacent_regions_joined = adjacent_regions_joined[0..options[:max_number]-1] if !options[:max_number].nil?
        adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score|
          puts "ProfID:#{patient_number}\t#{chr}\t#{start}\t#{stop}\t#{hpo_list.join(',')}\t#{association_values.join(',')}\t#{score}"
        end
      end
    end
  end #elsif

  pathway_stats = {}
  # Skip the KEGG step when no region list was produced for this profile;
  # it used to crash calling .each on nil in that case.
  if options[:retrieve_kegg_data] && !adjacent_regions_joined.nil?
    genes_found = adjacent_regions_joined.map do |adjacent_region|
      ref_chr, ref_start, ref_stop = adjacent_region
      chr_genes = gene_location[ref_chr]
      genes = []
      chr_genes.each do |gene_name, gene_start, gene_stop|
        # Interval overlap (assuming start <= stop for both intervals). This
        # covers the four cases tested before plus the ones they missed:
        # a region and a gene sharing a start and/or stop coordinate.
        genes << gene_name if ref_start < gene_stop && ref_stop > gene_start
      end
      genes
    end

    # Pair every gene with its KEGG entry (nil when the gene has none).
    genes_with_kegg_data = genes_found.map do |genes|
      genes.map { |gene| [gene, genes_with_kegg[gene]] }
    end
    pathway_stats = compute_pathway_enrichment(genes_with_kegg_data, genes_with_kegg)
    pathway_stats.sort!{|p1, p2| p1.last <=> p2.last}
  end

  #Creating html report
  #-------------------
  ####PLEASE CHECK THIS METHOD!
  # NOTE(review): adjacent_regions_joined and genes_with_kegg_data are nil here
  # when the matching step did not run - confirm report_data copes with that.
  report_data(characterised_hpos, adjacent_regions_joined, options[:html_file], hpo_storage, genes_with_kegg_data, pathway_stats) if options[:html_reporting]
end # end each_with_index
376
+
377
# Dump every recorded HPO recovery percentage, one line per predicted region,
# tagged with the profile it belongs to.
if options[:write_hpo_recovery_file]
  # Block form closes the file even if a write fails.
  File.open('output_profile_recovery', 'w') do |handler|
    predicted_hpo_percentage.each do |patient, percentage|
      percentage.each do |perc|
        handler.puts "ProfID:#{patient}\t#{perc.inspect}"
      end
    end
  end
end