pets 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +63 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +38 -151
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +7 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +110 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +11 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +143 -441
- data/lib/pets/cohort.rb +307 -0
- data/lib/pets/constants.rb +7 -0
- data/lib/pets/generalMethods.rb +8 -317
- data/lib/pets/genomic_features.rb +144 -0
- data/lib/pets/io.rb +457 -0
- data/lib/pets/parsers/cohort_parser.rb +106 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +8 -0
- data/pets.gemspec +1 -0
- data/templates/cohort_report.erb +5 -7
- data/templates/patient_report.erb +1 -1
- metadata +34 -3
@@ -1,130 +1,20 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
hpo = Ontology.new
|
19
|
-
hpo.read(hpo_file)
|
20
|
-
if !excluded_hpo_file.nil?
|
21
|
-
hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
|
22
|
-
hpo.remove_removable()
|
23
|
-
hpo.build_index()
|
24
|
-
end
|
25
|
-
end
|
26
|
-
return hpo
|
27
|
-
end
|
28
|
-
|
29
|
-
def format_patient_data(patient_data, options, hpo)
|
30
|
-
rejected_hpos = []
|
31
|
-
rejected_patients = []
|
32
|
-
patient_data.each do |pat_id, patient_record|
|
33
|
-
hpos, chr, start, stop = patient_record
|
34
|
-
|
35
|
-
if options[:hpo_names]
|
36
|
-
hpos, pat_rejected_hpos = hpo.translate_names(hpos)
|
37
|
-
if !pat_rejected_hpos.empty?
|
38
|
-
STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAMES '#{pat_rejected_hpos.join(',')}'. Rejected."
|
39
|
-
rejected_hpos.concat(pat_rejected_hpos)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
hpos, pat_rejected_hpos = hpo.check_ids(hpos.map{|a| a.to_sym})
|
44
|
-
if !pat_rejected_hpos.empty?
|
45
|
-
STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODES '#{pat_rejected_hpos.join(',')}'. Rejected."
|
46
|
-
rejected_hpos.concat(pat_rejected_hpos)
|
47
|
-
end
|
48
|
-
if hpos.empty?
|
49
|
-
rejected_patients << pat_id
|
50
|
-
else
|
51
|
-
patient_record[HPOS] = hpos
|
52
|
-
end
|
53
|
-
end
|
54
|
-
return rejected_hpos.uniq, rejected_patients
|
55
|
-
end
|
56
|
-
|
57
|
-
def compute_hpo_list_and_childs(patient_data, hpo)
|
58
|
-
all_hpo = []
|
59
|
-
suggested_childs = {}
|
60
|
-
total_terms = 0
|
61
|
-
terms_with_more_specific_childs = 0
|
62
|
-
patient_data.each do |pat_id, hpos|
|
63
|
-
total_terms += hpos.length
|
64
|
-
more_specific_childs = hpo.get_childs_table(hpos, true)
|
65
|
-
terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
|
66
|
-
suggested_childs[pat_id] = more_specific_childs
|
67
|
-
all_hpo.concat(hpos)
|
68
|
-
end
|
69
|
-
return all_hpo.uniq, suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
|
70
|
-
end
|
71
|
-
|
72
|
-
def clean_patient_profiles(hpo, patient_profiles)
|
73
|
-
rejected_patients = []
|
74
|
-
patient_profiles.each do |pat, prof|
|
75
|
-
phens = hpo.clean_profile_hard(prof)
|
76
|
-
if phens.empty?
|
77
|
-
rejected_patients << pat
|
78
|
-
else
|
79
|
-
patient_profiles[pat] = phens
|
80
|
-
end
|
81
|
-
end
|
82
|
-
patient_profiles.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)}
|
83
|
-
hpo.profiles = {}
|
84
|
-
hpo.load_profiles(patient_profiles)
|
85
|
-
|
86
|
-
end
|
87
|
-
|
88
|
-
def generate_patient_hpo_matrix(patient_data, cohort_hpos)
|
89
|
-
matrix = []
|
90
|
-
n = cohort_hpos.length
|
91
|
-
patient_data.each do |pat_id, pat_hpos|
|
92
|
-
vector = Array.new(n, 0)
|
93
|
-
pat_hpos.each do |hpo|
|
94
|
-
vector[cohort_hpos.index(hpo)] = 1
|
95
|
-
end
|
96
|
-
matrix << vector
|
97
|
-
end
|
98
|
-
return matrix
|
99
|
-
end
|
100
|
-
|
101
|
-
def generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
|
102
|
-
y_names = patient_data.keys
|
103
|
-
x_names = cohort_hpos
|
104
|
-
x_names_indx = {}
|
105
|
-
cohort_hpos.each_with_index{|hp,x| x_names_indx[hp]=x}
|
106
|
-
# row (y), cols (x)
|
107
|
-
matrix = Numo::DFloat.zeros(patient_data.length, cohort_hpos.length)
|
108
|
-
i = 0
|
109
|
-
patient_data.each do |pat_id, pat_hpos|
|
110
|
-
pat_hpos.each do |hp|
|
111
|
-
matrix[i, x_names_indx[hp]] = 1
|
112
|
-
end
|
113
|
-
i += 1
|
114
|
-
end
|
115
|
-
return matrix, y_names, x_names
|
116
|
-
end
|
117
|
-
|
118
|
-
def write_matrix_for_R(matrix, x_names, y_names, file)
|
119
|
-
File.open(file, 'w') do |f|
|
120
|
-
f.puts x_names.join("\t")
|
121
|
-
matrix.each_with_index do |row, i|
|
122
|
-
f.puts [y_names[i]].concat(row).join("\t")
|
123
|
-
end
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
def process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, patient_id_type) # get ic and chromosomes
|
1
|
+
require 'expcalc'
|
2
|
+
def translate_codes(clusters, hpo)
|
3
|
+
translated_clusters = []
|
4
|
+
clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
|
5
|
+
translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}}
|
6
|
+
translated_clusters << [clusterID,
|
7
|
+
num_of_pats,
|
8
|
+
patientIDs_ary,
|
9
|
+
patient_hpos_ary,
|
10
|
+
translate_codes
|
11
|
+
]
|
12
|
+
end
|
13
|
+
return translated_clusters
|
14
|
+
end
|
15
|
+
|
16
|
+
def process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) # get ic and chromosomes
|
17
|
+
ont = Cohort.get_ontology(Cohort.act_ont)
|
128
18
|
all_ics = []
|
129
19
|
all_lengths = []
|
130
20
|
top_cluster_phenotypes = []
|
@@ -132,30 +22,9 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
|
|
132
22
|
multi_chromosome_patients = 0
|
133
23
|
processed_clusters = 0
|
134
24
|
clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
|
135
|
-
|
136
|
-
|
137
|
-
all_phens =
|
138
|
-
profile_ics = []
|
139
|
-
profile_lengths = []
|
140
|
-
processed_patients = []
|
141
|
-
patient_ids.each do |pat_id|
|
142
|
-
phenotypes = patient_uniq_profiles[pat_id]
|
143
|
-
#pat_id = pat_id.gsub(/_i\d+$/,'') if patient_id_type != 'generated'
|
144
|
-
processed_patients << pat_id
|
145
|
-
profile_ics << get_profile_ic(phenotypes, phenotype_ic)
|
146
|
-
profile_lengths << phenotypes.length
|
147
|
-
if processed_clusters < options[:clusters2show_detailed_phen_data]
|
148
|
-
phen_names, rejected_codes = hpo.translate_ids(phenotypes) #optional
|
149
|
-
all_phens << phen_names
|
150
|
-
end
|
151
|
-
variants = equivalence[pat_id]
|
152
|
-
variants.each do |variant|
|
153
|
-
variant_data = patient_data[variant]
|
154
|
-
chrs[variant_data[CHR]] += 1 if !options[:chromosome_col].nil? && variant_data[CHR] != '-'
|
155
|
-
end
|
156
|
-
end
|
157
|
-
num_of_patients = processed_patients.length
|
158
|
-
next if num_of_patients == 1 # Check that current cluster only has one patient with several mutations
|
25
|
+
num_of_patients = patient_ids.length
|
26
|
+
next if num_of_patients == 1
|
27
|
+
chrs, all_phens, profile_ics, profile_lengths = process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
|
159
28
|
top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data]
|
160
29
|
all_ics << profile_ics
|
161
30
|
all_lengths << profile_lengths
|
@@ -170,12 +39,29 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
|
|
170
39
|
return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
|
171
40
|
end
|
172
41
|
|
42
|
+
def process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
|
43
|
+
chrs = Hash.new(0)
|
44
|
+
all_phens = []
|
45
|
+
profile_ics = []
|
46
|
+
profile_lengths = []
|
47
|
+
patient_ids.each do |pat_id|
|
48
|
+
phenotypes = patient_data.get_profile(pat_id)
|
49
|
+
profile_ics << get_profile_ic(phenotypes, phenotype_ic)
|
50
|
+
profile_lengths << phenotypes.length
|
51
|
+
if processed_clusters < options[:clusters2show_detailed_phen_data]
|
52
|
+
phen_names, rejected_codes = ont.translate_ids(phenotypes) #optional
|
53
|
+
all_phens << phen_names
|
54
|
+
end
|
55
|
+
patient_data.get_vars(pat_id).get_chr.each{|chr| chrs[chr] += 1} if !options[:chromosome_col].nil?
|
56
|
+
end
|
57
|
+
return chrs, all_phens, profile_ics, profile_lengths
|
58
|
+
end
|
59
|
+
|
173
60
|
def get_profile_ic(hpo_names, phenotype_ic)
|
174
61
|
ic = 0
|
175
62
|
profile_length = 0
|
176
63
|
hpo_names.each do |hpo_id|
|
177
64
|
hpo_ic = phenotype_ic[hpo_id]
|
178
|
-
# STDERR.puts phenotype_ic.inspect
|
179
65
|
ic += hpo_ic if !hpo_ic.nil?
|
180
66
|
profile_length += 1
|
181
67
|
end
|
@@ -183,154 +69,42 @@ def get_profile_ic(hpo_names, phenotype_ic)
|
|
183
69
|
return ic.fdiv(profile_length)
|
184
70
|
end
|
185
71
|
|
186
|
-
def
|
187
|
-
File.open(cluster_ic_data_file, 'w') do |f|
|
188
|
-
f.puts %w[cluster_id ic Plen].join("\t")
|
189
|
-
all_ics.each_with_index do |cluster_ics, i|
|
190
|
-
break if i == limit
|
191
|
-
cluster_length = cluster_ics.length
|
192
|
-
cluster_ics.each_with_index do |clust_ic, j|
|
193
|
-
f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
|
194
|
-
end
|
195
|
-
end
|
196
|
-
end
|
197
|
-
end
|
198
|
-
|
199
|
-
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
|
200
|
-
File.open(cluster_chromosome_data_file, 'w') do |f|
|
201
|
-
f.puts %w[cluster_id chr count].join("\t")
|
202
|
-
index = 0
|
203
|
-
last_id = cluster_data.first.first unless cluster_data.empty?
|
204
|
-
cluster_data.each do |cluster_id, patient_number, chr, count|
|
205
|
-
index += 1 if cluster_id != last_id
|
206
|
-
break if index == limit
|
207
|
-
f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
|
208
|
-
last_id = cluster_id
|
209
|
-
end
|
210
|
-
end
|
211
|
-
end
|
212
|
-
|
213
|
-
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
|
214
|
-
File.open(coverage_to_plot_file, 'w') do |f|
|
215
|
-
coverage_to_plot.each do |chr, position, freq|
|
216
|
-
f.puts "#{chr}\t#{position}\t#{freq}"
|
217
|
-
end
|
218
|
-
end
|
219
|
-
end
|
220
|
-
|
221
|
-
def get_uniq_hpo_profiles(patient_data) # To avoid duplications due to more one mutation in the same patient
|
222
|
-
hpo_profiles = {}
|
223
|
-
equivalence = {}
|
224
|
-
patient_data.each do |variant_id, patient_rec|
|
225
|
-
pat_id, count = variant_id.split('_i')
|
226
|
-
hpo_profiles[pat_id] = patient_rec[HPOS]
|
227
|
-
query = equivalence[pat_id]
|
228
|
-
if query.nil?
|
229
|
-
equivalence[pat_id] = [variant_id]
|
230
|
-
else
|
231
|
-
query << variant_id
|
232
|
-
end
|
233
|
-
end
|
234
|
-
return hpo_profiles, equivalence
|
235
|
-
end
|
236
|
-
|
237
|
-
def get_patient_ids(patient_data) # To aviod duplications due to more one mutation in the same patient
|
238
|
-
ids = []
|
239
|
-
patient_data.each do |pat_id, hpos|
|
240
|
-
id, count = pat_id.split('_i')
|
241
|
-
ids << id
|
242
|
-
end
|
243
|
-
return ids.uniq
|
244
|
-
end
|
245
|
-
|
246
|
-
def get_summary_stats(patient_data, rejected_patients, cohort_hpos, hpo)
|
72
|
+
def get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
|
247
73
|
stats = []
|
248
|
-
stats << ['Unique HPO terms',
|
249
|
-
stats << ['Cohort size',
|
74
|
+
stats << ['Unique HPO terms', hpo_stats.length]
|
75
|
+
stats << ['Cohort size', patient_data.profiles.length]
|
250
76
|
stats << ['Rejected patients by empty profile', rejected_patients.length]
|
251
|
-
|
252
|
-
stats << ['
|
253
|
-
stats << ['HPO
|
77
|
+
stats << ['HPOs per patient (average)', patient_data.get_profiles_mean_size]
|
78
|
+
stats << ['HPO terms per patient: percentile 90', patient_data.get_profile_length_at_percentile(perc=90)]
|
79
|
+
stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)]
|
80
|
+
stats << ['DsI for uniq HP terms', patient_data.get_dataset_specifity_index('uniq')]
|
81
|
+
stats << ['DsI for frequency weigthed HP terms', patient_data.get_dataset_specifity_index('weigthed')]
|
82
|
+
stats << ['Number of unknown phenotypes', rejected_hpos.length]
|
254
83
|
return stats
|
255
84
|
end
|
256
85
|
|
257
|
-
def
|
86
|
+
def dummy_cluster_patients(patient_data, matrix_file, clust_pat_file)
|
258
87
|
if !File.exists?(matrix_file)
|
259
|
-
pat_hpo_matrix, pat_id, hp_id =
|
88
|
+
pat_hpo_matrix, pat_id, hp_id = patient_data.to_bmatrix
|
260
89
|
x_axis_file = matrix_file.gsub('.npy','_x.lst')
|
261
|
-
File.open(x_axis_file, 'w'){|f| f.print hp_id.join("\n") }
|
262
90
|
y_axis_file = matrix_file.gsub('.npy','_y.lst')
|
263
|
-
|
264
|
-
Npy.save(matrix_file, pat_hpo_matrix)
|
91
|
+
pat_hpo_matrix.save(matrix_file, hp_id, x_axis_file, pat_id, y_axis_file)
|
265
92
|
end
|
266
|
-
|
267
|
-
clustered_patients = load_clustered_patients(
|
93
|
+
system_call(EXTERNAL_CODE, 'get_clusters.R', "-d #{matrix_file} -o #{clust_pat_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clust_pat_file)
|
94
|
+
clustered_patients = load_clustered_patients(clust_pat_file)
|
268
95
|
return(clustered_patients)
|
269
96
|
end
|
270
97
|
|
271
|
-
def
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
|
280
|
-
CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
|
281
|
-
suggested_childs.each do |pat_id, suggestions|
|
282
|
-
warning = nil
|
283
|
-
warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
|
284
|
-
csv << ["PATIENT #{pat_id}", "#{warning}"]
|
285
|
-
csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
|
286
|
-
suggestions.each do |parent, childs|
|
287
|
-
parent_code, parent_name = parent
|
288
|
-
if childs.empty?
|
289
|
-
csv << ["#{parent_name} (#{parent_code})", '-']
|
290
|
-
else
|
291
|
-
parent_writed = false
|
292
|
-
childs.each do |child_code, child_name|
|
293
|
-
if !parent_writed
|
294
|
-
parent_field = "#{parent_name} (#{parent_code})"
|
295
|
-
parent_writed = true
|
296
|
-
else
|
297
|
-
parent_field = ""
|
298
|
-
end
|
299
|
-
csv << [parent_field, "#{child_name} (#{child_code})"]
|
300
|
-
end
|
301
|
-
end
|
302
|
-
end
|
303
|
-
csv << ["", ""]
|
98
|
+
def get_mean_size(all_sizes)
|
99
|
+
accumulated_size = 0
|
100
|
+
number = 0
|
101
|
+
all_sizes.each do |size, occurrences|
|
102
|
+
accumulated_size += size *occurrences
|
103
|
+
number += occurrences
|
304
104
|
end
|
305
|
-
|
105
|
+
return accumulated_size.fdiv(number)
|
306
106
|
end
|
307
107
|
|
308
|
-
def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
|
309
|
-
File.open(filename, 'w') do |f|
|
310
|
-
f.puts "#{x_axis_name}\t#{y_axis_name}"
|
311
|
-
x_axis_value.each_with_index do |value,i|
|
312
|
-
y_value = y_axis_value[i]
|
313
|
-
raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
|
314
|
-
f.puts [value, y_value].join("\t")
|
315
|
-
end
|
316
|
-
end
|
317
|
-
end
|
318
|
-
|
319
|
-
def process_patient_data(patient_data)
|
320
|
-
parsed_patient_data = {}
|
321
|
-
patient_data.each do |patientID, metadata|
|
322
|
-
phenotypes, chr, start, stop = metadata
|
323
|
-
next if chr == '-'
|
324
|
-
info = [patientID, start.to_i, stop.to_i]
|
325
|
-
query = parsed_patient_data[chr]
|
326
|
-
if query.nil?
|
327
|
-
parsed_patient_data[chr] = [info]
|
328
|
-
else
|
329
|
-
query << info
|
330
|
-
end
|
331
|
-
end
|
332
|
-
return parsed_patient_data
|
333
|
-
end
|
334
108
|
|
335
109
|
def get_final_coverage(raw_coverage, bin_size)
|
336
110
|
coverage_to_plot = []
|
@@ -360,185 +134,113 @@ def get_sor_length_distribution(raw_coverage)
|
|
360
134
|
return all_cnvs_length
|
361
135
|
end
|
362
136
|
|
363
|
-
def get_cnvs_length(patient_data)
|
364
|
-
length_stats = Hash.new(0)
|
365
|
-
patient_data.each do |pat_id, patient_record|
|
366
|
-
string_hpos, chr, start, stop = patient_record
|
367
|
-
length_stats[stop - start] += 1
|
368
|
-
end
|
369
|
-
return length_stats.to_a.sort!{|stat| stat[1] <=> stat[1] }
|
370
|
-
end
|
371
|
-
|
372
|
-
|
373
137
|
def calculate_coverage(regions_data, delete_thresold = 0)
|
374
138
|
raw_coverage = {}
|
375
139
|
n_regions = 0
|
376
140
|
patients = 0
|
377
141
|
nt = 0
|
378
|
-
regions_data.each do |start, stop, chr,
|
379
|
-
number_of_patients =
|
142
|
+
regions_data.each do |start, stop, chr, reg_id|
|
143
|
+
number_of_patients = reg_id.split('.').last.to_i
|
380
144
|
if number_of_patients <= delete_thresold
|
381
145
|
number_of_patients = 0
|
382
146
|
else
|
383
147
|
n_regions += 1
|
384
|
-
patients += number_of_patients
|
385
148
|
nt += stop - start
|
386
149
|
end
|
387
|
-
|
388
|
-
|
389
|
-
if query.nil?
|
390
|
-
raw_coverage[chr] = [coords]
|
391
|
-
else
|
392
|
-
query << coords
|
393
|
-
end
|
150
|
+
add_record(raw_coverage, chr, [start, stop, number_of_patients])
|
151
|
+
patients += number_of_patients
|
394
152
|
end
|
395
153
|
return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
|
396
154
|
end
|
397
155
|
|
398
|
-
def
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
profile_sizes, parental_hpos_per_profile = profile_sizes.zip(parental_hpos_per_profile).sort_by{|i| i.first}.reverse.transpose
|
407
|
-
return profile_sizes, parental_hpos_per_profile
|
408
|
-
end
|
409
|
-
|
410
|
-
def format_profiles_similarity_data(profiles_similarity)
|
411
|
-
matrix = []
|
412
|
-
element_names = profiles_similarity.keys
|
413
|
-
matrix << element_names
|
414
|
-
profiles_similarity.each do |elementA, relations|
|
415
|
-
row = [elementA]
|
416
|
-
element_names.each do |elementB|
|
417
|
-
if elementA == elementB
|
418
|
-
row << 'NA'
|
419
|
-
else
|
420
|
-
query = relations[elementB]
|
421
|
-
if !query.nil?
|
422
|
-
row << query
|
423
|
-
else
|
424
|
-
row << profiles_similarity[elementB][elementA]
|
425
|
-
end
|
426
|
-
end
|
427
|
-
end
|
428
|
-
matrix << row
|
429
|
-
end
|
430
|
-
matrix[0].unshift('pat')
|
431
|
-
return matrix
|
432
|
-
end
|
433
|
-
|
434
|
-
def format_profiles_similarity_data_pairs(profiles_similarity)
|
435
|
-
pairs = []
|
436
|
-
element_names = profiles_similarity.keys
|
437
|
-
profiles_similarity.each do |elementA, relations|
|
438
|
-
element_names.each do |elementB|
|
439
|
-
if elementA != elementB
|
440
|
-
pair = [elementA, elementB]
|
441
|
-
query = relations[elementB]
|
442
|
-
if !query.nil?
|
443
|
-
pair << query
|
444
|
-
else
|
445
|
-
pair << profiles_similarity[elementB][elementA]
|
446
|
-
end
|
447
|
-
pairs << pair
|
156
|
+
def get_top_dummy_clusters_stats(top_clust_phen)
|
157
|
+
new_cluster_phenotypes = {}
|
158
|
+
top_clust_phen.each_with_index do |cluster, clusterID|
|
159
|
+
phenotypes_frequency = Hash.new(0)
|
160
|
+
total_patients = cluster.length
|
161
|
+
cluster.each do |phenotypes|
|
162
|
+
phenotypes.each do |p|
|
163
|
+
phenotypes_frequency[p] += 1
|
448
164
|
end
|
449
165
|
end
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
f.puts "#{pairsA}\t#{pairsB}\t#{values}"
|
166
|
+
new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
|
167
|
+
end
|
168
|
+
return new_cluster_phenotypes
|
169
|
+
end
|
170
|
+
|
171
|
+
def remove_nested_entries(nested_hash)
|
172
|
+
empty_root_ids = []
|
173
|
+
nested_hash.each do |root_id, entries|
|
174
|
+
entries.select!{|id, val| yield(id, val)}
|
175
|
+
empty_root_ids << root_id if entries.empty?
|
176
|
+
end
|
177
|
+
empty_root_ids.each{|id| nested_hash.delete(id)}
|
178
|
+
end
|
179
|
+
|
180
|
+
def get_semantic_similarity_clustering(options, patient_data, temp_folder)
|
181
|
+
template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read
|
182
|
+
hpo = Cohort.get_ontology(Cohort.act_ont)
|
183
|
+
reference_profiles = nil
|
184
|
+
reference_profiles = load_profiles(options[:reference_profiles], hpo) if !options[:reference_profiles].nil?
|
185
|
+
Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name|
|
186
|
+
matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
|
187
|
+
profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
|
188
|
+
clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
|
189
|
+
if !File.exists?(matrix_filename)
|
190
|
+
profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
|
191
|
+
remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
|
192
|
+
write_profile_pairs(profiles_similarity, profiles_similarity_filename)
|
193
|
+
if reference_profiles.nil?
|
194
|
+
axis_file = matrix_filename.gsub('.npy','.lst')
|
195
|
+
similarity_matrix, axis_names = profiles_similarity.to_wmatrix(squared: true, symm: true)
|
196
|
+
similarity_matrix.save(matrix_filename, axis_names, axis_file)
|
197
|
+
else
|
198
|
+
axis_file_x = matrix_filename.gsub('.npy','_x.lst')
|
199
|
+
axis_file_y = matrix_filename.gsub('.npy','_y.lst')
|
200
|
+
similarity_matrix, y_names, x_names = profiles_similarity.to_wmatrix(squared: false, symm: true)
|
201
|
+
similarity_matrix.save(matrix_filename, y_names, axis_file_y, x_names, axis_file_x)
|
487
202
|
end
|
488
203
|
end
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
end
|
525
|
-
end
|
526
|
-
|
527
|
-
def get_cluster_metadata(clusters_info, output_file)
|
204
|
+
ext_var = ''
|
205
|
+
if method_name == 'resnik'
|
206
|
+
ext_var = '-m max'
|
207
|
+
elsif method_name == 'lin'
|
208
|
+
ext_var = '-m comp1'
|
209
|
+
end
|
210
|
+
cluster_file = "#{method_name}_clusters.txt"
|
211
|
+
if !reference_profiles.nil?
|
212
|
+
ext_var << ' -s'
|
213
|
+
axis_file = "#{axis_file_y},#{axis_file_x}"
|
214
|
+
cluster_file = "#{method_name}_clusters_rows.txt"
|
215
|
+
end
|
216
|
+
out_file = File.join(temp_folder, method_name)
|
217
|
+
system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file + '_heatmap.png')
|
218
|
+
clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, cluster_file), patient_data)
|
219
|
+
write_patient_hpo_stat(get_cluster_metadata(clusters_info), clusters_distribution_filename)
|
220
|
+
out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
|
221
|
+
system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
|
222
|
+
clusters = translate_codes(clusters_codes, hpo)
|
223
|
+
|
224
|
+
container = {
|
225
|
+
:temp_folder => temp_folder,
|
226
|
+
:cluster_name => method_name,
|
227
|
+
:clusters => clusters,
|
228
|
+
:hpo => hpo
|
229
|
+
}
|
230
|
+
|
231
|
+
report = Report_html.new(container, 'Patient clusters report')
|
232
|
+
report.build(template)
|
233
|
+
report.write(options[:output_file]+"_#{method_name}_clusters.html")
|
234
|
+
system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -m #{method_name} -o #{File.join(temp_folder, method_name + '_sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png'))
|
235
|
+
end
|
236
|
+
end
|
237
|
+
|
238
|
+
def get_cluster_metadata(clusters_info)
|
528
239
|
average_hp_per_pat_distribution = []
|
529
|
-
tmp = []
|
530
240
|
clusters_info.each do |cl_id, pat_info|
|
531
241
|
hp_per_pat_in_clust = pat_info.values.map{|a| a.length}
|
532
242
|
hp_per_pat_ave = hp_per_pat_in_clust.sum.fdiv(hp_per_pat_in_clust.length)
|
533
243
|
average_hp_per_pat_distribution << [pat_info.length, hp_per_pat_ave]
|
534
|
-
tmp << hp_per_pat_in_clust
|
535
|
-
end
|
536
|
-
total_clusters = clusters_info.length
|
537
|
-
average_phenotypes_by_cluster = tmp.flatten.sum.fdiv(total_clusters)
|
538
|
-
File.open(output_file, 'w') do |f|
|
539
|
-
f.puts "#{'PatientsNumber'}\t#{'HPOAverage'}"
|
540
|
-
average_hp_per_pat_distribution.each do |patient_num, ave|
|
541
|
-
f.puts "#{patient_num}\t#{ave}"
|
542
|
-
end
|
543
244
|
end
|
245
|
+
return average_hp_per_pat_distribution
|
544
246
|
end
|