pets 0.2.3 → 0.2.5
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +68 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +102 -150
- data/bin/get_gen_features.rb +146 -0
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +8 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +86 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +16 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +172 -424
- data/lib/pets/cohort.rb +309 -0
- data/lib/pets/common_optparse.rb +30 -0
- data/lib/pets/constants.rb +8 -0
- data/lib/pets/generalMethods.rb +29 -319
- data/lib/pets/genomic_features.rb +240 -0
- data/lib/pets/io.rb +481 -0
- data/lib/pets/parsers/cohort_parser.rb +111 -0
- data/lib/pets/parsers/reference_parser.rb +39 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +9 -0
- data/pets.gemspec +7 -3
- data/templates/cluster_report.erb +25 -5
- data/templates/cohort_report.erb +5 -7
- data/templates/evidence_profile.erb +20 -4
- data/templates/patient_report.erb +1 -1
- metadata +96 -5
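The hunks below come from data/lib/pets/coPatReporterMethods.rb; it is the only file in the list whose +172/-424 line counts fit this fragment (544 lines before the change, 292 after).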
@@ -1,130 +1,20 @@
-require '
-
-
-
-
-
-
-
-
-
-
-    if !excluded_hpo_file.nil?
-      hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
-    else
-      hpo = Ontology.new(file: hpo_file, load_file: true)
-    end
-  else
-    hpo = Ontology.new
-    hpo.read(hpo_file)
-    if !excluded_hpo_file.nil?
-      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
-      hpo.remove_removable()
-      hpo.build_index()
-    end
-  end
-  return hpo
-end
-
-def format_patient_data(patient_data, options, hpo)
-  rejected_hpos = []
-  rejected_patients = []
-  patient_data.each do |pat_id, patient_record|
-    hpos, chr, start, stop = patient_record
-
-    if options[:hpo_names]
-      hpos, pat_rejected_hpos = hpo.translate_names(hpos)
-      if !pat_rejected_hpos.empty?
-        STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAMES '#{pat_rejected_hpos.join(',')}'. Rejected."
-        rejected_hpos.concat(pat_rejected_hpos)
-      end
-    end
-
-    hpos, pat_rejected_hpos = hpo.check_ids(hpos.map{|a| a.to_sym})
-    if !pat_rejected_hpos.empty?
-      STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODES '#{pat_rejected_hpos.join(',')}'. Rejected."
-      rejected_hpos.concat(pat_rejected_hpos)
-    end
-    if hpos.empty?
-      rejected_patients << pat_id
-    else
-      patient_record[HPOS] = hpos
-    end
-  end
-  return rejected_hpos.uniq, rejected_patients
-end
-
-def compute_hpo_list_and_childs(patient_data, hpo)
-  all_hpo = []
-  suggested_childs = {}
-  total_terms = 0
-  terms_with_more_specific_childs = 0
-  patient_data.each do |pat_id, hpos|
-    total_terms += hpos.length
-    more_specific_childs = hpo.get_childs_table(hpos, true)
-    terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
-    suggested_childs[pat_id] = more_specific_childs
-    all_hpo.concat(hpos)
-  end
-  return all_hpo.uniq, suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
-end
-
-def clean_patient_profiles(hpo, patient_profiles)
-  rejected_patients = []
-  patient_profiles.each do |pat, prof|
-    phens = hpo.clean_profile_hard(prof)
-    if phens.empty?
-      rejected_patients << pat
-    else
-      patient_profiles[pat] = phens
-    end
-  end
-  patient_profiles.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)}
-  hpo.profiles = {}
-  hpo.load_profiles(patient_profiles)
-
-end
-
-def generate_patient_hpo_matrix(patient_data, cohort_hpos)
-  matrix = []
-  n = cohort_hpos.length
-  patient_data.each do |pat_id, pat_hpos|
-    vector = Array.new(n, 0)
-    pat_hpos.each do |hpo|
-      vector[cohort_hpos.index(hpo)] = 1
-    end
-    matrix << vector
-  end
-  return matrix
-end
-
-def generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
-  y_names = patient_data.keys
-  x_names = cohort_hpos
-  x_names_indx = {}
-  cohort_hpos.each_with_index{|hp,x| x_names_indx[hp]=x}
-  # row (y), cols (x)
-  matrix = Numo::DFloat.zeros(patient_data.length, cohort_hpos.length)
-  i = 0
-  patient_data.each do |pat_id, pat_hpos|
-    pat_hpos.each do |hp|
-      matrix[i, x_names_indx[hp]] = 1
-    end
-    i += 1
+require 'expcalc'
+def translate_codes(clusters, hpo)
+  translated_clusters = []
+  clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
+    translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}}
+    translated_clusters << [clusterID,
+      num_of_pats,
+      patientIDs_ary,
+      patient_hpos_ary,
+      translate_codes
+    ]
   end
-  return
+  return translated_clusters
 end
 
-def
-
-    f.puts x_names.join("\t")
-    matrix.each_with_index do |row, i|
-      f.puts [y_names[i]].concat(row).join("\t")
-    end
-  end
-end
-
-def process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, patient_id_type) # get ic and chromosomes
+def process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) # get ic and chromosomes
+  ont = Cohort.get_ontology(Cohort.act_ont)
   all_ics = []
   all_lengths = []
   top_cluster_phenotypes = []
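The new translate_codes helper consumes cluster tuples shaped as [clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary] and appends a fifth element holding the human-readable term names. A minimal sketch of that data flow, with a stub object standing in for the gem's real ontology (the stub's translate_id table is invented for illustration):

    hpo = Object.new
    def hpo.translate_id(code)  # toy lookup, not the gem's API
      {'HP:0001250' => 'Seizure', 'HP:0001251' => 'Ataxia'}[code]
    end

    clusters = [[0, 2, ['patA', 'patB'], [['HP:0001250'], ['HP:0001250', 'HP:0001251']]]]
    clusters.each do |cluster_id, n_pats, pat_ids, hpo_lists|
      names = hpo_lists.map{|profile| profile.map{|code| hpo.translate_id(code)}}
      puts "cluster #{cluster_id} (#{n_pats} patients #{pat_ids.join(',')}): #{names.inspect}"
    end
    # cluster 0 (2 patients patA,patB): [["Seizure"], ["Seizure", "Ataxia"]]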
@@ -132,30 +22,9 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
   multi_chromosome_patients = 0
   processed_clusters = 0
   clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
-
-
-    all_phens =
-    profile_ics = []
-    profile_lengths = []
-    processed_patients = []
-    patient_ids.each do |pat_id|
-      phenotypes = patient_uniq_profiles[pat_id]
-      #pat_id = pat_id.gsub(/_i\d+$/,'') if patient_id_type != 'generated'
-      processed_patients << pat_id
-      profile_ics << get_profile_ic(phenotypes, phenotype_ic)
-      profile_lengths << phenotypes.length
-      if processed_clusters < options[:clusters2show_detailed_phen_data]
-        phen_names, rejected_codes = hpo.translate_ids(phenotypes) #optional
-        all_phens << phen_names
-      end
-      variants = equivalence[pat_id]
-      variants.each do |variant|
-        variant_data = patient_data[variant]
-        chrs[variant_data[CHR]] += 1 if !options[:chromosome_col].nil? && variant_data[CHR] != '-'
-      end
-    end
-    num_of_patients = processed_patients.length
-    next if num_of_patients == 1 # Check that current cluster only has one patient with several mutations
+    num_of_patients = patient_ids.length
+    next if num_of_patients == 1
+    chrs, all_phens, profile_ics, profile_lengths = process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
     top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data]
     all_ics << profile_ics
     all_lengths << profile_lengths
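The rewritten loop above also moves the singleton check before any per-patient work: clusters are visited largest-first and one-patient clusters are now skipped immediately, where the old code only discovered them after building processed_patients. The iteration order in isolation:

    clustered_patients = {1 => ['patA'], 2 => ['patB', 'patC', 'patD'], 3 => ['patE', 'patF']}
    clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
      next if patient_ids.length == 1  # singleton clusters carry no co-occurrence signal
      puts "cluster #{cluster_id}: #{patient_ids.length} patients"
    end
    # cluster 2: 3 patients
    # cluster 3: 2 patients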
@@ -170,167 +39,73 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
   return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
 end
 
+def process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
+  chrs = Hash.new(0)
+  all_phens = []
+  profile_ics = []
+  profile_lengths = []
+  patient_ids.each do |pat_id|
+    phenotypes = patient_data.get_profile(pat_id)
+    profile_ics << get_profile_ic(phenotypes, phenotype_ic)
+    profile_lengths << phenotypes.length
+    if processed_clusters < options[:clusters2show_detailed_phen_data]
+      phen_names, rejected_codes = ont.translate_ids(phenotypes) #optional
+      all_phens << phen_names
+    end
+    patient_data.get_vars(pat_id).get_chr.each{|chr| chrs[chr] += 1} if !options[:chromosome_col].nil?
+  end
+  return chrs, all_phens, profile_ics, profile_lengths
+end
+
 def get_profile_ic(hpo_names, phenotype_ic)
   ic = 0
   profile_length = 0
   hpo_names.each do |hpo_id|
     hpo_ic = phenotype_ic[hpo_id]
-    #
-    ic += hpo_ic
+    raise("The term #{hpo_id} not exists in the given ic table") if hpo_ic.nil?
+    ic += hpo_ic
     profile_length += 1
   end
   profile_length = 1 if profile_length == 0
   return ic.fdiv(profile_length)
 end
 
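get_profile_ic is a pure function (sum of per-term information content divided by profile length, with a guard against empty profiles), so it can be exercised on its own; the IC values here are invented for illustration:

    def get_profile_ic(hpo_names, phenotype_ic)
      ic = 0
      profile_length = 0
      hpo_names.each do |hpo_id|
        hpo_ic = phenotype_ic[hpo_id]
        raise("The term #{hpo_id} not exists in the given ic table") if hpo_ic.nil?
        ic += hpo_ic
        profile_length += 1
      end
      profile_length = 1 if profile_length == 0
      return ic.fdiv(profile_length)
    end

    toy_ic_table = {'HP:0001250' => 4.2, 'HP:0000118' => 0.1}  # hypothetical IC values
    puts get_profile_ic(['HP:0001250', 'HP:0000118'], toy_ic_table)  # => 2.15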
-def
-  File.open(cluster_ic_data_file, 'w') do |f|
-    f.puts %w[cluster_id ic Plen].join("\t")
-    all_ics.each_with_index do |cluster_ics, i|
-      break if i == limit
-      cluster_length = cluster_ics.length
-      cluster_ics.each_with_index do |clust_ic, j|
-        f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
-      end
-    end
-  end
-end
-
-def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
-  File.open(cluster_chromosome_data_file, 'w') do |f|
-    f.puts %w[cluster_id chr count].join("\t")
-    index = 0
-    last_id = cluster_data.first.first unless cluster_data.empty?
-    cluster_data.each do |cluster_id, patient_number, chr, count|
-      index += 1 if cluster_id != last_id
-      break if index == limit
-      f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
-      last_id = cluster_id
-    end
-  end
-end
-
-def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
-  File.open(coverage_to_plot_file, 'w') do |f|
-    coverage_to_plot.each do |chr, position, freq|
-      f.puts "#{chr}\t#{position}\t#{freq}"
-    end
-  end
-end
-
-def get_uniq_hpo_profiles(patient_data) # To avoid duplications due to more one mutation in the same patient
-  hpo_profiles = {}
-  equivalence = {}
-  patient_data.each do |variant_id, patient_rec|
-    pat_id, count = variant_id.split('_i')
-    hpo_profiles[pat_id] = patient_rec[HPOS]
-    query = equivalence[pat_id]
-    if query.nil?
-      equivalence[pat_id] = [variant_id]
-    else
-      query << variant_id
-    end
-  end
-  return hpo_profiles, equivalence
-end
-
-def get_patient_ids(patient_data) # To aviod duplications due to more one mutation in the same patient
-  ids = []
-  patient_data.each do |pat_id, hpos|
-    id, count = pat_id.split('_i')
-    ids << id
-  end
-  return ids.uniq
-end
-
-def get_summary_stats(patient_data, rejected_patients, cohort_hpos, hpo)
+def get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
   stats = []
-  stats << ['Unique HPO terms',
-  stats << ['Cohort size',
+  stats << ['Unique HPO terms', hpo_stats.length]
+  stats << ['Cohort size', patient_data.profiles.length]
   stats << ['Rejected patients by empty profile', rejected_patients.length]
-
-  stats << ['
-  stats << ['HPO
+  stats << ['HPOs per patient (average)', patient_data.get_profiles_mean_size]
+  stats << ['HPO terms per patient: percentile 90', patient_data.get_profile_length_at_percentile(perc=90)]
+  stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)]
+  stats << ['DsI for uniq HP terms', patient_data.get_dataset_specifity_index('uniq')]
+  stats << ['DsI for frequency weigthed HP terms', patient_data.get_dataset_specifity_index('weigthed')]
+  stats << ['Number of unknown phenotypes', rejected_hpos.length]
   return stats
 end
 
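Several of the new statistics delegate to the Cohort class introduced in this release (data/lib/pets/cohort.rb), so they cannot be shown in isolation here. As a rough sketch only, assuming 'percentile 90' means the plain 90th percentile of per-patient profile lengths (the gem's get_profile_length_at_percentile may compute it differently):

    # Hypothetical stand-in for Cohort#get_profile_length_at_percentile.
    def profile_length_at_percentile(profiles, perc = 90)
      lengths = profiles.values.map(&:length).sort
      index = ((perc / 100.0) * (lengths.length - 1)).round  # nearest-rank position
      lengths[index]
    end

    profiles = {'patA' => ['HP:1'], 'patB' => ['HP:1', 'HP:2'], 'patC' => ['HP:1', 'HP:2', 'HP:3']}
    puts profile_length_at_percentile(profiles)  # => 3 for this toy cohort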
-def
+def dummy_cluster_patients(patient_data, matrix_file, clust_pat_file)
   if !File.exists?(matrix_file)
-    pat_hpo_matrix, pat_id, hp_id =
+    pat_hpo_matrix, pat_id, hp_id = patient_data.to_bmatrix
     x_axis_file = matrix_file.gsub('.npy','_x.lst')
-    File.open(x_axis_file, 'w'){|f| f.print hp_id.join("\n") }
     y_axis_file = matrix_file.gsub('.npy','_y.lst')
-
-    Npy.save(matrix_file, pat_hpo_matrix)
+    pat_hpo_matrix.save(matrix_file, hp_id, x_axis_file, pat_id, y_axis_file)
   end
-
-  clustered_patients = load_clustered_patients(
+  system_call(EXTERNAL_CODE, 'get_clusters.R', "-d #{matrix_file} -o #{clust_pat_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clust_pat_file)
+  clustered_patients = load_clustered_patients(clust_pat_file)
   return(clustered_patients)
 end
 
-def
-
-
-
-
-
-
-
-def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
-  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
-    suggested_childs.each do |pat_id, suggestions|
-      warning = nil
-      warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
-      csv << ["PATIENT #{pat_id}", "#{warning}"]
-      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
-      suggestions.each do |parent, childs|
-        parent_code, parent_name = parent
-        if childs.empty?
-          csv << ["#{parent_name} (#{parent_code})", '-']
-        else
-          parent_writed = false
-          childs.each do |child_code, child_name|
-            if !parent_writed
-              parent_field = "#{parent_name} (#{parent_code})"
-              parent_writed = true
-            else
-              parent_field = ""
-            end
-            csv << [parent_field, "#{child_name} (#{child_code})"]
-          end
-        end
-      end
-      csv << ["", ""]
+def get_mean_size(all_sizes)
+  accumulated_size = 0
+  number = 0
+  all_sizes.each do |size, occurrences|
+    accumulated_size += size *occurrences
+    number += occurrences
   end
-
+  return accumulated_size.fdiv(number)
 end
 
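get_mean_size reads a {size => occurrences} histogram (the shape the package builds for CNV/SOR length counts) and returns the occurrence-weighted mean, reproduced here with a toy histogram:

    def get_mean_size(all_sizes)
      accumulated_size = 0
      number = 0
      all_sizes.each do |size, occurrences|
        accumulated_size += size *occurrences
        number += occurrences
      end
      return accumulated_size.fdiv(number)
    end

    puts get_mean_size({100 => 2, 500 => 1, 1000 => 1})  # (200 + 500 + 1000) / 4 => 425.0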
-def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
-  File.open(filename, 'w') do |f|
-    f.puts "#{x_axis_name}\t#{y_axis_name}"
-    x_axis_value.each_with_index do |value,i|
-      y_value = y_axis_value[i]
-      raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
-      f.puts [value, y_value].join("\t")
-    end
-  end
-end
-
-def process_patient_data(patient_data)
-  parsed_patient_data = {}
-  patient_data.each do |patientID, metadata|
-    phenotypes, chr, start, stop = metadata
-    next if chr == '-'
-    info = [patientID, start.to_i, stop.to_i]
-    query = parsed_patient_data[chr]
-    if query.nil?
-      parsed_patient_data[chr] = [info]
-    else
-      query << info
-    end
-  end
-  return parsed_patient_data
-end
 
 def get_final_coverage(raw_coverage, bin_size)
   coverage_to_plot = []
@@ -360,185 +135,158 @@ def get_sor_length_distribution(raw_coverage)
   return all_cnvs_length
 end
 
-def get_cnvs_length(patient_data)
-  length_stats = Hash.new(0)
-  patient_data.each do |pat_id, patient_record|
-    string_hpos, chr, start, stop = patient_record
-    length_stats[stop - start] += 1
-  end
-  return length_stats.to_a.sort!{|stat| stat[1] <=> stat[1] }
-end
-
-
 def calculate_coverage(regions_data, delete_thresold = 0)
   raw_coverage = {}
   n_regions = 0
   patients = 0
   nt = 0
-  regions_data.each do |start, stop, chr,
-    number_of_patients =
+  regions_data.each do |start, stop, chr, reg_id|
+    number_of_patients = reg_id.split('.').last.to_i
     if number_of_patients <= delete_thresold
       number_of_patients = 0
     else
       n_regions += 1
-      patients += number_of_patients
       nt += stop - start
     end
-
-
-    if query.nil?
-      raw_coverage[chr] = [coords]
-    else
-      query << coords
-    end
+    add_record(raw_coverage, chr, [start, stop, number_of_patients])
+    patients += number_of_patients
   end
   return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
 end
 
-def
-
-
-
-
-
-
-
-  profile_sizes, parental_hpos_per_profile = profile_sizes.zip(parental_hpos_per_profile).sort_by{|i| i.first}.reverse.transpose
-  return profile_sizes, parental_hpos_per_profile
-end
-
-def format_profiles_similarity_data(profiles_similarity)
-  matrix = []
-  element_names = profiles_similarity.keys
-  matrix << element_names
-  profiles_similarity.each do |elementA, relations|
-    row = [elementA]
-    element_names.each do |elementB|
-      if elementA == elementB
-        row << 'NA'
-      else
-        query = relations[elementB]
-        if !query.nil?
-          row << query
-        else
-          row << profiles_similarity[elementB][elementA]
-        end
+def get_top_dummy_clusters_stats(top_clust_phen)
+  new_cluster_phenotypes = {}
+  top_clust_phen.each_with_index do |cluster, clusterID|
+    phenotypes_frequency = Hash.new(0)
+    total_patients = cluster.length
+    cluster.each do |phenotypes|
+      phenotypes.each do |p|
+        phenotypes_frequency[p] += 1
       end
     end
-
+    new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
   end
-
-  return matrix
+  return new_cluster_phenotypes
 end
 
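get_top_dummy_clusters_stats condenses each cluster's translated profiles into [patient count, phenotype names, percentage of the cluster's patients showing each phenotype], reproduced with toy input:

    def get_top_dummy_clusters_stats(top_clust_phen)
      new_cluster_phenotypes = {}
      top_clust_phen.each_with_index do |cluster, clusterID|
        phenotypes_frequency = Hash.new(0)
        total_patients = cluster.length
        cluster.each do |phenotypes|
          phenotypes.each do |p|
            phenotypes_frequency[p] += 1
          end
        end
        new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
      end
      return new_cluster_phenotypes
    end

    # One cluster of two patients: 'Seizure' appears in both profiles, 'Ataxia' in one.
    p get_top_dummy_clusters_stats([[['Seizure'], ['Seizure', 'Ataxia']]])
    # => {0=>[2, ["Seizure", "Ataxia"], [100.0, 50.0]]}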
-def
-
-
-
-
-  if elementA != elementB
-    pair = [elementA, elementB]
-    query = relations[elementB]
-    if !query.nil?
-      pair << query
-    else
-      pair << profiles_similarity[elementB][elementA]
-    end
-    pairs << pair
-  end
-end
+def remove_nested_entries(nested_hash)
+  empty_root_ids = []
+  nested_hash.each do |root_id, entries|
+    entries.select!{|id, val| yield(id, val)}
+    empty_root_ids << root_id if entries.empty?
   end
-
-end
-
-def
-
-
-
-
-
-
-
-
-
-
-
-
+  empty_root_ids.each{|id| nested_hash.delete(id)}
+end
+
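remove_nested_entries filters a two-level hash in place with the caller's block and deletes any root whose entries all fail the test; get_semantic_similarity_clustering below uses it to apply the :sim_thr similarity cutoff. Reproduced with a toy similarity hash:

    def remove_nested_entries(nested_hash)
      empty_root_ids = []
      nested_hash.each do |root_id, entries|
        entries.select!{|id, val| yield(id, val)}
        empty_root_ids << root_id if entries.empty?
      end
      empty_root_ids.each{|id| nested_hash.delete(id)}
    end

    sims = {'patA' => {'patB' => 0.9, 'patC' => 0.2}, 'patD' => {'patE' => 0.1}}
    remove_nested_entries(sims){|id, sim| sim >= 0.5}  # keep pairs at or above the cutoff
    p sims  # => {"patA"=>{"patB"=>0.9}} ('patD' lost every entry and was dropped)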
+def get_semantic_similarity_clustering(options, patient_data, temp_folder)
+  template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read
+  hpo = Cohort.get_ontology(Cohort.act_ont)
+  reference_profiles = nil
+  reference_profiles = load_profiles(options[:reference_profiles], hpo) if !options[:reference_profiles].nil?
+  Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name|
+    matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
+    profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
+    clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
+    if !File.exists?(matrix_filename)
+      if reference_profiles.nil?
+        profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
+      else # AS reference profiles are constant, the sematic comparation will be A => B (A reference). So, we have to invert the elements to perform the comparation
+        ont = Cohort.get_ontology(:hpo)
+        pat_profiles = ont.profiles
+        ont.load_profiles(reference_profiles, reset_stored: true)
+        profiles_similarity = ont.compare_profiles(sim_type: method_name.to_sym,
+                                                   external_profiles: pat_profiles,
+                                                   bidirectional: false)
+        ont.load_profiles(pat_profiles, reset_stored: true)
+        profiles_similarity = invert_nested_hash(profiles_similarity)
+      end
+      remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
+      write_profile_pairs(profiles_similarity, profiles_similarity_filename)
+      if reference_profiles.nil?
+        axis_file = matrix_filename.gsub('.npy','.lst')
+        similarity_matrix, axis_names = profiles_similarity.to_wmatrix(squared: true, symm: true)
+        similarity_matrix.save(matrix_filename, axis_names, axis_file)
+      else
+        axis_file_x = matrix_filename.gsub('.npy','_x.lst')
+        axis_file_y = matrix_filename.gsub('.npy','_y.lst')
+        similarity_matrix, y_names, x_names = profiles_similarity.to_wmatrix(squared: false, symm: true)
+        similarity_matrix.save(matrix_filename, y_names, axis_file_y, x_names, axis_file_x)
       end
     end
-
-
-
-
-
-def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
-  File.open(similarity_matrix_file, 'w') do |f|
-    similarity_matrix.each do |row|
-      f.puts row.join("\t")
+    ext_var = ''
+    if method_name == 'resnik'
+      ext_var = '-m max'
+    elsif method_name == 'lin'
+      ext_var = '-m comp1'
     end
-
-
-
-
-
-
-
-
+    cluster_file = "#{method_name}_clusters.txt"
+    if !reference_profiles.nil?
+      ext_var << ' -s'
+      axis_file = "#{axis_file_y},#{axis_file_x}"
+      cluster_file = "#{method_name}_clusters_rows.txt"
+    end
+    out_file = File.join(temp_folder, method_name)
+    system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file + '_heatmap.png')
+    clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, cluster_file), patient_data)
+    write_patient_hpo_stat(get_cluster_metadata(clusters_info), clusters_distribution_filename)
+    out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
+    system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
+    sim_mat4cluster = {}
+    if options[:detailed_clusters]
+      clusters_codes.each do |cluster|
+        cluster_cohort = Cohort.new
+        clID, patient_number, patient_ids, hpo_codes = cluster
+        patient_ids.each_with_index {|patID, i| cluster_cohort.add_record([patID, hpo_codes[i], []])}
+        cluster_profiles = cluster_cohort.profiles
+        ref_profile = cluster_cohort.get_general_profile
+        hpo.load_profiles({ref: ref_profile}, reset_stored: true)
+        similarities = hpo.compare_profiles(external_profiles: cluster_profiles, sim_type: :lin, bidirectional: false)
+        candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(ref_profile, similarities[:ref], cluster_profiles, hpo, 100, 100)
+        candidate_sim_matrix.unshift(['HP'] + candidates_ids)
+        sim_mat4cluster[clID] = candidate_sim_matrix
       end
     end
-end
-end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    clusters_info.each do |clusterID, patients_info|
-      patients_per_cluster = patients_info.keys.length
-      clusters_table << [clusterID, patients_per_cluster, patients_info.keys, patients_info.values]
+
+    clusters = translate_codes(clusters_codes, hpo)
+    container = {
+      :temp_folder => temp_folder,
+      :cluster_name => method_name,
+      :clusters => clusters,
+      :hpo => hpo,
+      :sim_mat4cluster => sim_mat4cluster
+    }
+
+    report = Report_html.new(container, 'Patient clusters report')
+    report.build(template)
+    report.write(options[:output_file]+"_#{method_name}_clusters.html")
+    system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -m #{method_name} -o #{File.join(temp_folder, method_name + '_sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png'))
   end
-  return clusters_table, clusters_info
 end
 
-def
-
-
-
-
-
-
-
-
-
+def invert_nested_hash(h)
+  new_h = {}
+  h.each do |k1, vals1|
+    vals1.each do |v1|
+      vals1.each do |k2, vals2|
+        query = new_h[k2]
+        if query.nil?
+          new_h[k2] = {k1 => vals2}
+        else
+          query[k1] = vals2
+        end
+      end
     end
   end
+  return new_h
 end
 
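invert_nested_hash swaps the two key levels, turning {refID => {patID => score}} into {patID => {refID => score}}; the clustering code uses it to restore patient-first ordering after comparing against fixed reference profiles. Reproduced below (the outer vals1.each pass re-runs the inner loop, so the function does redundant work but still returns the expected inversion):

    def invert_nested_hash(h)
      new_h = {}
      h.each do |k1, vals1|
        vals1.each do |v1|
          vals1.each do |k2, vals2|
            query = new_h[k2]
            if query.nil?
              new_h[k2] = {k1 => vals2}
            else
              query[k1] = vals2
            end
          end
        end
      end
      return new_h
    end

    p invert_nested_hash({'ref1' => {'patA' => 0.8, 'patB' => 0.3}})
    # => {"patA"=>{"ref1"=>0.8}, "patB"=>{"ref1"=>0.3}}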
-def get_cluster_metadata(clusters_info
+def get_cluster_metadata(clusters_info)
   average_hp_per_pat_distribution = []
-  tmp = []
   clusters_info.each do |cl_id, pat_info|
     hp_per_pat_in_clust = pat_info.values.map{|a| a.length}
     hp_per_pat_ave = hp_per_pat_in_clust.sum.fdiv(hp_per_pat_in_clust.length)
     average_hp_per_pat_distribution << [pat_info.length, hp_per_pat_ave]
-    tmp << hp_per_pat_in_clust
-  end
-  total_clusters = clusters_info.length
-  average_phenotypes_by_cluster = tmp.flatten.sum.fdiv(total_clusters)
-  File.open(output_file, 'w') do |f|
-    f.puts "#{'PatientsNumber'}\t#{'HPOAverage'}"
-    average_hp_per_pat_distribution.each do |patient_num, ave|
-      f.puts "#{patient_num}\t#{ave}"
-    end
   end
+  return average_hp_per_pat_distribution
 end