pets 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +63 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +38 -151
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +7 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +110 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +11 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +143 -441
- data/lib/pets/cohort.rb +307 -0
- data/lib/pets/constants.rb +7 -0
- data/lib/pets/generalMethods.rb +8 -317
- data/lib/pets/genomic_features.rb +144 -0
- data/lib/pets/io.rb +457 -0
- data/lib/pets/parsers/cohort_parser.rb +106 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +8 -0
- data/pets.gemspec +1 -0
- data/templates/cohort_report.erb +5 -7
- data/templates/patient_report.erb +1 -1
- metadata +34 -3
@@ -1,130 +1,20 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
hpo = Ontology.new
|
19
|
-
hpo.read(hpo_file)
|
20
|
-
if !excluded_hpo_file.nil?
|
21
|
-
hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
|
22
|
-
hpo.remove_removable()
|
23
|
-
hpo.build_index()
|
24
|
-
end
|
25
|
-
end
|
26
|
-
return hpo
|
27
|
-
end
|
28
|
-
|
29
|
-
def format_patient_data(patient_data, options, hpo)
|
30
|
-
rejected_hpos = []
|
31
|
-
rejected_patients = []
|
32
|
-
patient_data.each do |pat_id, patient_record|
|
33
|
-
hpos, chr, start, stop = patient_record
|
34
|
-
|
35
|
-
if options[:hpo_names]
|
36
|
-
hpos, pat_rejected_hpos = hpo.translate_names(hpos)
|
37
|
-
if !pat_rejected_hpos.empty?
|
38
|
-
STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAMES '#{pat_rejected_hpos.join(',')}'. Rejected."
|
39
|
-
rejected_hpos.concat(pat_rejected_hpos)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
hpos, pat_rejected_hpos = hpo.check_ids(hpos.map{|a| a.to_sym})
|
44
|
-
if !pat_rejected_hpos.empty?
|
45
|
-
STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODES '#{pat_rejected_hpos.join(',')}'. Rejected."
|
46
|
-
rejected_hpos.concat(pat_rejected_hpos)
|
47
|
-
end
|
48
|
-
if hpos.empty?
|
49
|
-
rejected_patients << pat_id
|
50
|
-
else
|
51
|
-
patient_record[HPOS] = hpos
|
52
|
-
end
|
53
|
-
end
|
54
|
-
return rejected_hpos.uniq, rejected_patients
|
55
|
-
end
|
56
|
-
|
57
|
-
def compute_hpo_list_and_childs(patient_data, hpo)
|
58
|
-
all_hpo = []
|
59
|
-
suggested_childs = {}
|
60
|
-
total_terms = 0
|
61
|
-
terms_with_more_specific_childs = 0
|
62
|
-
patient_data.each do |pat_id, hpos|
|
63
|
-
total_terms += hpos.length
|
64
|
-
more_specific_childs = hpo.get_childs_table(hpos, true)
|
65
|
-
terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
|
66
|
-
suggested_childs[pat_id] = more_specific_childs
|
67
|
-
all_hpo.concat(hpos)
|
68
|
-
end
|
69
|
-
return all_hpo.uniq, suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
|
70
|
-
end
|
71
|
-
|
72
|
-
def clean_patient_profiles(hpo, patient_profiles)
|
73
|
-
rejected_patients = []
|
74
|
-
patient_profiles.each do |pat, prof|
|
75
|
-
phens = hpo.clean_profile_hard(prof)
|
76
|
-
if phens.empty?
|
77
|
-
rejected_patients << pat
|
78
|
-
else
|
79
|
-
patient_profiles[pat] = phens
|
80
|
-
end
|
81
|
-
end
|
82
|
-
patient_profiles.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)}
|
83
|
-
hpo.profiles = {}
|
84
|
-
hpo.load_profiles(patient_profiles)
|
85
|
-
|
86
|
-
end
|
87
|
-
|
88
|
-
def generate_patient_hpo_matrix(patient_data, cohort_hpos)
|
89
|
-
matrix = []
|
90
|
-
n = cohort_hpos.length
|
91
|
-
patient_data.each do |pat_id, pat_hpos|
|
92
|
-
vector = Array.new(n, 0)
|
93
|
-
pat_hpos.each do |hpo|
|
94
|
-
vector[cohort_hpos.index(hpo)] = 1
|
95
|
-
end
|
96
|
-
matrix << vector
|
97
|
-
end
|
98
|
-
return matrix
|
99
|
-
end
|
100
|
-
|
101
|
-
def generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
|
102
|
-
y_names = patient_data.keys
|
103
|
-
x_names = cohort_hpos
|
104
|
-
x_names_indx = {}
|
105
|
-
cohort_hpos.each_with_index{|hp,x| x_names_indx[hp]=x}
|
106
|
-
# row (y), cols (x)
|
107
|
-
matrix = Numo::DFloat.zeros(patient_data.length, cohort_hpos.length)
|
108
|
-
i = 0
|
109
|
-
patient_data.each do |pat_id, pat_hpos|
|
110
|
-
pat_hpos.each do |hp|
|
111
|
-
matrix[i, x_names_indx[hp]] = 1
|
112
|
-
end
|
113
|
-
i += 1
|
114
|
-
end
|
115
|
-
return matrix, y_names, x_names
|
116
|
-
end
|
117
|
-
|
118
|
-
def write_matrix_for_R(matrix, x_names, y_names, file)
|
119
|
-
File.open(file, 'w') do |f|
|
120
|
-
f.puts x_names.join("\t")
|
121
|
-
matrix.each_with_index do |row, i|
|
122
|
-
f.puts [y_names[i]].concat(row).join("\t")
|
123
|
-
end
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
def process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, patient_id_type) # get ic and chromosomes
|
1
|
+
require 'expcalc'
|
2
|
+
def translate_codes(clusters, hpo)
|
3
|
+
translated_clusters = []
|
4
|
+
clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
|
5
|
+
translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}}
|
6
|
+
translated_clusters << [clusterID,
|
7
|
+
num_of_pats,
|
8
|
+
patientIDs_ary,
|
9
|
+
patient_hpos_ary,
|
10
|
+
translate_codes
|
11
|
+
]
|
12
|
+
end
|
13
|
+
return translated_clusters
|
14
|
+
end
|
15
|
+
|
16
|
+
def process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) # get ic and chromosomes
|
17
|
+
ont = Cohort.get_ontology(Cohort.act_ont)
|
128
18
|
all_ics = []
|
129
19
|
all_lengths = []
|
130
20
|
top_cluster_phenotypes = []
|
@@ -132,30 +22,9 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
|
|
132
22
|
multi_chromosome_patients = 0
|
133
23
|
processed_clusters = 0
|
134
24
|
clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
|
135
|
-
|
136
|
-
|
137
|
-
all_phens =
|
138
|
-
profile_ics = []
|
139
|
-
profile_lengths = []
|
140
|
-
processed_patients = []
|
141
|
-
patient_ids.each do |pat_id|
|
142
|
-
phenotypes = patient_uniq_profiles[pat_id]
|
143
|
-
#pat_id = pat_id.gsub(/_i\d+$/,'') if patient_id_type != 'generated'
|
144
|
-
processed_patients << pat_id
|
145
|
-
profile_ics << get_profile_ic(phenotypes, phenotype_ic)
|
146
|
-
profile_lengths << phenotypes.length
|
147
|
-
if processed_clusters < options[:clusters2show_detailed_phen_data]
|
148
|
-
phen_names, rejected_codes = hpo.translate_ids(phenotypes) #optional
|
149
|
-
all_phens << phen_names
|
150
|
-
end
|
151
|
-
variants = equivalence[pat_id]
|
152
|
-
variants.each do |variant|
|
153
|
-
variant_data = patient_data[variant]
|
154
|
-
chrs[variant_data[CHR]] += 1 if !options[:chromosome_col].nil? && variant_data[CHR] != '-'
|
155
|
-
end
|
156
|
-
end
|
157
|
-
num_of_patients = processed_patients.length
|
158
|
-
next if num_of_patients == 1 # Check that current cluster only has one patient with several mutations
|
25
|
+
num_of_patients = patient_ids.length
|
26
|
+
next if num_of_patients == 1
|
27
|
+
chrs, all_phens, profile_ics, profile_lengths = process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
|
159
28
|
top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data]
|
160
29
|
all_ics << profile_ics
|
161
30
|
all_lengths << profile_lengths
|
@@ -170,12 +39,29 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
|
|
170
39
|
return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
|
171
40
|
end
|
172
41
|
|
42
|
+
def process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
|
43
|
+
chrs = Hash.new(0)
|
44
|
+
all_phens = []
|
45
|
+
profile_ics = []
|
46
|
+
profile_lengths = []
|
47
|
+
patient_ids.each do |pat_id|
|
48
|
+
phenotypes = patient_data.get_profile(pat_id)
|
49
|
+
profile_ics << get_profile_ic(phenotypes, phenotype_ic)
|
50
|
+
profile_lengths << phenotypes.length
|
51
|
+
if processed_clusters < options[:clusters2show_detailed_phen_data]
|
52
|
+
phen_names, rejected_codes = ont.translate_ids(phenotypes) #optional
|
53
|
+
all_phens << phen_names
|
54
|
+
end
|
55
|
+
patient_data.get_vars(pat_id).get_chr.each{|chr| chrs[chr] += 1} if !options[:chromosome_col].nil?
|
56
|
+
end
|
57
|
+
return chrs, all_phens, profile_ics, profile_lengths
|
58
|
+
end
|
59
|
+
|
173
60
|
def get_profile_ic(hpo_names, phenotype_ic)
|
174
61
|
ic = 0
|
175
62
|
profile_length = 0
|
176
63
|
hpo_names.each do |hpo_id|
|
177
64
|
hpo_ic = phenotype_ic[hpo_id]
|
178
|
-
# STDERR.puts phenotype_ic.inspect
|
179
65
|
ic += hpo_ic if !hpo_ic.nil?
|
180
66
|
profile_length += 1
|
181
67
|
end
|
@@ -183,154 +69,42 @@ def get_profile_ic(hpo_names, phenotype_ic)
|
|
183
69
|
return ic.fdiv(profile_length)
|
184
70
|
end
|
185
71
|
|
186
|
-
def
|
187
|
-
File.open(cluster_ic_data_file, 'w') do |f|
|
188
|
-
f.puts %w[cluster_id ic Plen].join("\t")
|
189
|
-
all_ics.each_with_index do |cluster_ics, i|
|
190
|
-
break if i == limit
|
191
|
-
cluster_length = cluster_ics.length
|
192
|
-
cluster_ics.each_with_index do |clust_ic, j|
|
193
|
-
f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
|
194
|
-
end
|
195
|
-
end
|
196
|
-
end
|
197
|
-
end
|
198
|
-
|
199
|
-
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
|
200
|
-
File.open(cluster_chromosome_data_file, 'w') do |f|
|
201
|
-
f.puts %w[cluster_id chr count].join("\t")
|
202
|
-
index = 0
|
203
|
-
last_id = cluster_data.first.first unless cluster_data.empty?
|
204
|
-
cluster_data.each do |cluster_id, patient_number, chr, count|
|
205
|
-
index += 1 if cluster_id != last_id
|
206
|
-
break if index == limit
|
207
|
-
f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
|
208
|
-
last_id = cluster_id
|
209
|
-
end
|
210
|
-
end
|
211
|
-
end
|
212
|
-
|
213
|
-
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
|
214
|
-
File.open(coverage_to_plot_file, 'w') do |f|
|
215
|
-
coverage_to_plot.each do |chr, position, freq|
|
216
|
-
f.puts "#{chr}\t#{position}\t#{freq}"
|
217
|
-
end
|
218
|
-
end
|
219
|
-
end
|
220
|
-
|
221
|
-
def get_uniq_hpo_profiles(patient_data) # To avoid duplications due to more one mutation in the same patient
|
222
|
-
hpo_profiles = {}
|
223
|
-
equivalence = {}
|
224
|
-
patient_data.each do |variant_id, patient_rec|
|
225
|
-
pat_id, count = variant_id.split('_i')
|
226
|
-
hpo_profiles[pat_id] = patient_rec[HPOS]
|
227
|
-
query = equivalence[pat_id]
|
228
|
-
if query.nil?
|
229
|
-
equivalence[pat_id] = [variant_id]
|
230
|
-
else
|
231
|
-
query << variant_id
|
232
|
-
end
|
233
|
-
end
|
234
|
-
return hpo_profiles, equivalence
|
235
|
-
end
|
236
|
-
|
237
|
-
def get_patient_ids(patient_data) # To aviod duplications due to more one mutation in the same patient
|
238
|
-
ids = []
|
239
|
-
patient_data.each do |pat_id, hpos|
|
240
|
-
id, count = pat_id.split('_i')
|
241
|
-
ids << id
|
242
|
-
end
|
243
|
-
return ids.uniq
|
244
|
-
end
|
245
|
-
|
246
|
-
def get_summary_stats(patient_data, rejected_patients, cohort_hpos, hpo)
|
72
|
+
def get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
|
247
73
|
stats = []
|
248
|
-
stats << ['Unique HPO terms',
|
249
|
-
stats << ['Cohort size',
|
74
|
+
stats << ['Unique HPO terms', hpo_stats.length]
|
75
|
+
stats << ['Cohort size', patient_data.profiles.length]
|
250
76
|
stats << ['Rejected patients by empty profile', rejected_patients.length]
|
251
|
-
|
252
|
-
stats << ['
|
253
|
-
stats << ['HPO
|
77
|
+
stats << ['HPOs per patient (average)', patient_data.get_profiles_mean_size]
|
78
|
+
stats << ['HPO terms per patient: percentile 90', patient_data.get_profile_length_at_percentile(perc=90)]
|
79
|
+
stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)]
|
80
|
+
stats << ['DsI for uniq HP terms', patient_data.get_dataset_specifity_index('uniq')]
|
81
|
+
stats << ['DsI for frequency weigthed HP terms', patient_data.get_dataset_specifity_index('weigthed')]
|
82
|
+
stats << ['Number of unknown phenotypes', rejected_hpos.length]
|
254
83
|
return stats
|
255
84
|
end
|
256
85
|
|
257
|
-
def
|
86
|
+
def dummy_cluster_patients(patient_data, matrix_file, clust_pat_file)
|
258
87
|
if !File.exists?(matrix_file)
|
259
|
-
pat_hpo_matrix, pat_id, hp_id =
|
88
|
+
pat_hpo_matrix, pat_id, hp_id = patient_data.to_bmatrix
|
260
89
|
x_axis_file = matrix_file.gsub('.npy','_x.lst')
|
261
|
-
File.open(x_axis_file, 'w'){|f| f.print hp_id.join("\n") }
|
262
90
|
y_axis_file = matrix_file.gsub('.npy','_y.lst')
|
263
|
-
|
264
|
-
Npy.save(matrix_file, pat_hpo_matrix)
|
91
|
+
pat_hpo_matrix.save(matrix_file, hp_id, x_axis_file, pat_id, y_axis_file)
|
265
92
|
end
|
266
|
-
|
267
|
-
clustered_patients = load_clustered_patients(
|
93
|
+
system_call(EXTERNAL_CODE, 'get_clusters.R', "-d #{matrix_file} -o #{clust_pat_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clust_pat_file)
|
94
|
+
clustered_patients = load_clustered_patients(clust_pat_file)
|
268
95
|
return(clustered_patients)
|
269
96
|
end
|
270
97
|
|
271
|
-
def
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
|
280
|
-
CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
|
281
|
-
suggested_childs.each do |pat_id, suggestions|
|
282
|
-
warning = nil
|
283
|
-
warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
|
284
|
-
csv << ["PATIENT #{pat_id}", "#{warning}"]
|
285
|
-
csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
|
286
|
-
suggestions.each do |parent, childs|
|
287
|
-
parent_code, parent_name = parent
|
288
|
-
if childs.empty?
|
289
|
-
csv << ["#{parent_name} (#{parent_code})", '-']
|
290
|
-
else
|
291
|
-
parent_writed = false
|
292
|
-
childs.each do |child_code, child_name|
|
293
|
-
if !parent_writed
|
294
|
-
parent_field = "#{parent_name} (#{parent_code})"
|
295
|
-
parent_writed = true
|
296
|
-
else
|
297
|
-
parent_field = ""
|
298
|
-
end
|
299
|
-
csv << [parent_field, "#{child_name} (#{child_code})"]
|
300
|
-
end
|
301
|
-
end
|
302
|
-
end
|
303
|
-
csv << ["", ""]
|
98
|
+
def get_mean_size(all_sizes)
|
99
|
+
accumulated_size = 0
|
100
|
+
number = 0
|
101
|
+
all_sizes.each do |size, occurrences|
|
102
|
+
accumulated_size += size *occurrences
|
103
|
+
number += occurrences
|
304
104
|
end
|
305
|
-
|
105
|
+
return accumulated_size.fdiv(number)
|
306
106
|
end
|
307
107
|
|
308
|
-
def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
|
309
|
-
File.open(filename, 'w') do |f|
|
310
|
-
f.puts "#{x_axis_name}\t#{y_axis_name}"
|
311
|
-
x_axis_value.each_with_index do |value,i|
|
312
|
-
y_value = y_axis_value[i]
|
313
|
-
raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
|
314
|
-
f.puts [value, y_value].join("\t")
|
315
|
-
end
|
316
|
-
end
|
317
|
-
end
|
318
|
-
|
319
|
-
def process_patient_data(patient_data)
|
320
|
-
parsed_patient_data = {}
|
321
|
-
patient_data.each do |patientID, metadata|
|
322
|
-
phenotypes, chr, start, stop = metadata
|
323
|
-
next if chr == '-'
|
324
|
-
info = [patientID, start.to_i, stop.to_i]
|
325
|
-
query = parsed_patient_data[chr]
|
326
|
-
if query.nil?
|
327
|
-
parsed_patient_data[chr] = [info]
|
328
|
-
else
|
329
|
-
query << info
|
330
|
-
end
|
331
|
-
end
|
332
|
-
return parsed_patient_data
|
333
|
-
end
|
334
108
|
|
335
109
|
def get_final_coverage(raw_coverage, bin_size)
|
336
110
|
coverage_to_plot = []
|
@@ -360,185 +134,113 @@ def get_sor_length_distribution(raw_coverage)
|
|
360
134
|
return all_cnvs_length
|
361
135
|
end
|
362
136
|
|
363
|
-
def get_cnvs_length(patient_data)
|
364
|
-
length_stats = Hash.new(0)
|
365
|
-
patient_data.each do |pat_id, patient_record|
|
366
|
-
string_hpos, chr, start, stop = patient_record
|
367
|
-
length_stats[stop - start] += 1
|
368
|
-
end
|
369
|
-
return length_stats.to_a.sort!{|stat| stat[1] <=> stat[1] }
|
370
|
-
end
|
371
|
-
|
372
|
-
|
373
137
|
def calculate_coverage(regions_data, delete_thresold = 0)
|
374
138
|
raw_coverage = {}
|
375
139
|
n_regions = 0
|
376
140
|
patients = 0
|
377
141
|
nt = 0
|
378
|
-
regions_data.each do |start, stop, chr,
|
379
|
-
number_of_patients =
|
142
|
+
regions_data.each do |start, stop, chr, reg_id|
|
143
|
+
number_of_patients = reg_id.split('.').last.to_i
|
380
144
|
if number_of_patients <= delete_thresold
|
381
145
|
number_of_patients = 0
|
382
146
|
else
|
383
147
|
n_regions += 1
|
384
|
-
patients += number_of_patients
|
385
148
|
nt += stop - start
|
386
149
|
end
|
387
|
-
|
388
|
-
|
389
|
-
if query.nil?
|
390
|
-
raw_coverage[chr] = [coords]
|
391
|
-
else
|
392
|
-
query << coords
|
393
|
-
end
|
150
|
+
add_record(raw_coverage, chr, [start, stop, number_of_patients])
|
151
|
+
patients += number_of_patients
|
394
152
|
end
|
395
153
|
return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
|
396
154
|
end
|
397
155
|
|
398
|
-
def
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
profile_sizes, parental_hpos_per_profile = profile_sizes.zip(parental_hpos_per_profile).sort_by{|i| i.first}.reverse.transpose
|
407
|
-
return profile_sizes, parental_hpos_per_profile
|
408
|
-
end
|
409
|
-
|
410
|
-
def format_profiles_similarity_data(profiles_similarity)
|
411
|
-
matrix = []
|
412
|
-
element_names = profiles_similarity.keys
|
413
|
-
matrix << element_names
|
414
|
-
profiles_similarity.each do |elementA, relations|
|
415
|
-
row = [elementA]
|
416
|
-
element_names.each do |elementB|
|
417
|
-
if elementA == elementB
|
418
|
-
row << 'NA'
|
419
|
-
else
|
420
|
-
query = relations[elementB]
|
421
|
-
if !query.nil?
|
422
|
-
row << query
|
423
|
-
else
|
424
|
-
row << profiles_similarity[elementB][elementA]
|
425
|
-
end
|
426
|
-
end
|
427
|
-
end
|
428
|
-
matrix << row
|
429
|
-
end
|
430
|
-
matrix[0].unshift('pat')
|
431
|
-
return matrix
|
432
|
-
end
|
433
|
-
|
434
|
-
def format_profiles_similarity_data_pairs(profiles_similarity)
|
435
|
-
pairs = []
|
436
|
-
element_names = profiles_similarity.keys
|
437
|
-
profiles_similarity.each do |elementA, relations|
|
438
|
-
element_names.each do |elementB|
|
439
|
-
if elementA != elementB
|
440
|
-
pair = [elementA, elementB]
|
441
|
-
query = relations[elementB]
|
442
|
-
if !query.nil?
|
443
|
-
pair << query
|
444
|
-
else
|
445
|
-
pair << profiles_similarity[elementB][elementA]
|
446
|
-
end
|
447
|
-
pairs << pair
|
156
|
+
def get_top_dummy_clusters_stats(top_clust_phen)
|
157
|
+
new_cluster_phenotypes = {}
|
158
|
+
top_clust_phen.each_with_index do |cluster, clusterID|
|
159
|
+
phenotypes_frequency = Hash.new(0)
|
160
|
+
total_patients = cluster.length
|
161
|
+
cluster.each do |phenotypes|
|
162
|
+
phenotypes.each do |p|
|
163
|
+
phenotypes_frequency[p] += 1
|
448
164
|
end
|
449
165
|
end
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
f.puts "#{pairsA}\t#{pairsB}\t#{values}"
|
166
|
+
new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
|
167
|
+
end
|
168
|
+
return new_cluster_phenotypes
|
169
|
+
end
|
170
|
+
|
171
|
+
def remove_nested_entries(nested_hash)
|
172
|
+
empty_root_ids = []
|
173
|
+
nested_hash.each do |root_id, entries|
|
174
|
+
entries.select!{|id, val| yield(id, val)}
|
175
|
+
empty_root_ids << root_id if entries.empty?
|
176
|
+
end
|
177
|
+
empty_root_ids.each{|id| nested_hash.delete(id)}
|
178
|
+
end
|
179
|
+
|
180
|
+
def get_semantic_similarity_clustering(options, patient_data, temp_folder)
|
181
|
+
template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read
|
182
|
+
hpo = Cohort.get_ontology(Cohort.act_ont)
|
183
|
+
reference_profiles = nil
|
184
|
+
reference_profiles = load_profiles(options[:reference_profiles], hpo) if !options[:reference_profiles].nil?
|
185
|
+
Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name|
|
186
|
+
matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
|
187
|
+
profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
|
188
|
+
clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
|
189
|
+
if !File.exists?(matrix_filename)
|
190
|
+
profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
|
191
|
+
remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
|
192
|
+
write_profile_pairs(profiles_similarity, profiles_similarity_filename)
|
193
|
+
if reference_profiles.nil?
|
194
|
+
axis_file = matrix_filename.gsub('.npy','.lst')
|
195
|
+
similarity_matrix, axis_names = profiles_similarity.to_wmatrix(squared: true, symm: true)
|
196
|
+
similarity_matrix.save(matrix_filename, axis_names, axis_file)
|
197
|
+
else
|
198
|
+
axis_file_x = matrix_filename.gsub('.npy','_x.lst')
|
199
|
+
axis_file_y = matrix_filename.gsub('.npy','_y.lst')
|
200
|
+
similarity_matrix, y_names, x_names = profiles_similarity.to_wmatrix(squared: false, symm: true)
|
201
|
+
similarity_matrix.save(matrix_filename, y_names, axis_file_y, x_names, axis_file_x)
|
487
202
|
end
|
488
203
|
end
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
end
|
525
|
-
end
|
526
|
-
|
527
|
-
def get_cluster_metadata(clusters_info, output_file)
|
204
|
+
ext_var = ''
|
205
|
+
if method_name == 'resnik'
|
206
|
+
ext_var = '-m max'
|
207
|
+
elsif method_name == 'lin'
|
208
|
+
ext_var = '-m comp1'
|
209
|
+
end
|
210
|
+
cluster_file = "#{method_name}_clusters.txt"
|
211
|
+
if !reference_profiles.nil?
|
212
|
+
ext_var << ' -s'
|
213
|
+
axis_file = "#{axis_file_y},#{axis_file_x}"
|
214
|
+
cluster_file = "#{method_name}_clusters_rows.txt"
|
215
|
+
end
|
216
|
+
out_file = File.join(temp_folder, method_name)
|
217
|
+
system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file + '_heatmap.png')
|
218
|
+
clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, cluster_file), patient_data)
|
219
|
+
write_patient_hpo_stat(get_cluster_metadata(clusters_info), clusters_distribution_filename)
|
220
|
+
out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
|
221
|
+
system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
|
222
|
+
clusters = translate_codes(clusters_codes, hpo)
|
223
|
+
|
224
|
+
container = {
|
225
|
+
:temp_folder => temp_folder,
|
226
|
+
:cluster_name => method_name,
|
227
|
+
:clusters => clusters,
|
228
|
+
:hpo => hpo
|
229
|
+
}
|
230
|
+
|
231
|
+
report = Report_html.new(container, 'Patient clusters report')
|
232
|
+
report.build(template)
|
233
|
+
report.write(options[:output_file]+"_#{method_name}_clusters.html")
|
234
|
+
system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -m #{method_name} -o #{File.join(temp_folder, method_name + '_sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png'))
|
235
|
+
end
|
236
|
+
end
|
237
|
+
|
238
|
+
def get_cluster_metadata(clusters_info)
|
528
239
|
average_hp_per_pat_distribution = []
|
529
|
-
tmp = []
|
530
240
|
clusters_info.each do |cl_id, pat_info|
|
531
241
|
hp_per_pat_in_clust = pat_info.values.map{|a| a.length}
|
532
242
|
hp_per_pat_ave = hp_per_pat_in_clust.sum.fdiv(hp_per_pat_in_clust.length)
|
533
243
|
average_hp_per_pat_distribution << [pat_info.length, hp_per_pat_ave]
|
534
|
-
tmp << hp_per_pat_in_clust
|
535
|
-
end
|
536
|
-
total_clusters = clusters_info.length
|
537
|
-
average_phenotypes_by_cluster = tmp.flatten.sum.fdiv(total_clusters)
|
538
|
-
File.open(output_file, 'w') do |f|
|
539
|
-
f.puts "#{'PatientsNumber'}\t#{'HPOAverage'}"
|
540
|
-
average_hp_per_pat_distribution.each do |patient_num, ave|
|
541
|
-
f.puts "#{patient_num}\t#{ave}"
|
542
|
-
end
|
543
244
|
end
|
245
|
+
return average_hp_per_pat_distribution
|
544
246
|
end
|