pets 0.2.3 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/README.md +79 -5
  4. data/bin/coPatReporter.rb +68 -156
  5. data/bin/comPatMondo.rb +1 -4
  6. data/bin/evidence_profiler.rb +102 -150
  7. data/bin/get_gen_features.rb +146 -0
  8. data/bin/get_network_nodes.rb +79 -132
  9. data/bin/get_sorted_profs.rb +25 -36
  10. data/bin/install_deps.rb +8 -0
  11. data/bin/paco_translator.rb +29 -72
  12. data/bin/phen2reg.rb +1 -4
  13. data/bin/profiles2phenopacket.rb +86 -0
  14. data/bin/reg2phen.rb +1 -3
  15. data/example_datasets/associations_file.txt +757 -0
  16. data/example_datasets/example_patient.txt +6 -0
  17. data/example_datasets/example_patient_hpos.txt +15 -0
  18. data/example_datasets/genes.txt +8 -0
  19. data/example_datasets/hpo2ci.txt +2798 -0
  20. data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
  21. data/example_datasets/launch.sh +20 -0
  22. data/external_code/generate_boxpot.R +51 -21
  23. data/external_code/get_clusters.R +2 -2
  24. data/external_code/install_R_dependencies.R +16 -0
  25. data/external_code/plot_heatmap.R +34 -30
  26. data/lib/pets/coPatReporterMethods.rb +172 -424
  27. data/lib/pets/cohort.rb +309 -0
  28. data/lib/pets/common_optparse.rb +30 -0
  29. data/lib/pets/constants.rb +8 -0
  30. data/lib/pets/generalMethods.rb +29 -319
  31. data/lib/pets/genomic_features.rb +240 -0
  32. data/lib/pets/io.rb +481 -0
  33. data/lib/pets/parsers/cohort_parser.rb +111 -0
  34. data/lib/pets/parsers/reference_parser.rb +39 -0
  35. data/lib/pets/version.rb +1 -1
  36. data/lib/pets.rb +9 -0
  37. data/pets.gemspec +7 -3
  38. data/templates/cluster_report.erb +25 -5
  39. data/templates/cohort_report.erb +5 -7
  40. data/templates/evidence_profile.erb +20 -4
  41. data/templates/patient_report.erb +1 -1
  42. metadata +96 -5
@@ -1,130 +1,20 @@
1
- require 'numo/narray'
2
- require 'semtools'
3
-
4
- HPOS = 0
5
- CHR = 1
6
- START = 2
7
- STOP = 3
8
-
9
- def load_hpo_ontology(hpo_file, excluded_hpo_file)
10
- hpo = nil
11
- if !hpo_file.include?('.json')
12
- if !excluded_hpo_file.nil?
13
- hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
14
- else
15
- hpo = Ontology.new(file: hpo_file, load_file: true)
16
- end
17
- else
18
- hpo = Ontology.new
19
- hpo.read(hpo_file)
20
- if !excluded_hpo_file.nil?
21
- hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
22
- hpo.remove_removable()
23
- hpo.build_index()
24
- end
25
- end
26
- return hpo
27
- end
28
-
29
- def format_patient_data(patient_data, options, hpo)
30
- rejected_hpos = []
31
- rejected_patients = []
32
- patient_data.each do |pat_id, patient_record|
33
- hpos, chr, start, stop = patient_record
34
-
35
- if options[:hpo_names]
36
- hpos, pat_rejected_hpos = hpo.translate_names(hpos)
37
- if !pat_rejected_hpos.empty?
38
- STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAMES '#{pat_rejected_hpos.join(',')}'. Rejected."
39
- rejected_hpos.concat(pat_rejected_hpos)
40
- end
41
- end
42
-
43
- hpos, pat_rejected_hpos = hpo.check_ids(hpos.map{|a| a.to_sym})
44
- if !pat_rejected_hpos.empty?
45
- STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODES '#{pat_rejected_hpos.join(',')}'. Rejected."
46
- rejected_hpos.concat(pat_rejected_hpos)
47
- end
48
- if hpos.empty?
49
- rejected_patients << pat_id
50
- else
51
- patient_record[HPOS] = hpos
52
- end
53
- end
54
- return rejected_hpos.uniq, rejected_patients
55
- end
56
-
57
- def compute_hpo_list_and_childs(patient_data, hpo)
58
- all_hpo = []
59
- suggested_childs = {}
60
- total_terms = 0
61
- terms_with_more_specific_childs = 0
62
- patient_data.each do |pat_id, hpos|
63
- total_terms += hpos.length
64
- more_specific_childs = hpo.get_childs_table(hpos, true)
65
- terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
66
- suggested_childs[pat_id] = more_specific_childs
67
- all_hpo.concat(hpos)
68
- end
69
- return all_hpo.uniq, suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
70
- end
71
-
72
- def clean_patient_profiles(hpo, patient_profiles)
73
- rejected_patients = []
74
- patient_profiles.each do |pat, prof|
75
- phens = hpo.clean_profile_hard(prof)
76
- if phens.empty?
77
- rejected_patients << pat
78
- else
79
- patient_profiles[pat] = phens
80
- end
81
- end
82
- patient_profiles.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)}
83
- hpo.profiles = {}
84
- hpo.load_profiles(patient_profiles)
85
-
86
- end
87
-
88
- def generate_patient_hpo_matrix(patient_data, cohort_hpos)
89
- matrix = []
90
- n = cohort_hpos.length
91
- patient_data.each do |pat_id, pat_hpos|
92
- vector = Array.new(n, 0)
93
- pat_hpos.each do |hpo|
94
- vector[cohort_hpos.index(hpo)] = 1
95
- end
96
- matrix << vector
97
- end
98
- return matrix
99
- end
100
-
101
- def generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
102
- y_names = patient_data.keys
103
- x_names = cohort_hpos
104
- x_names_indx = {}
105
- cohort_hpos.each_with_index{|hp,x| x_names_indx[hp]=x}
106
- # row (y), cols (x)
107
- matrix = Numo::DFloat.zeros(patient_data.length, cohort_hpos.length)
108
- i = 0
109
- patient_data.each do |pat_id, pat_hpos|
110
- pat_hpos.each do |hp|
111
- matrix[i, x_names_indx[hp]] = 1
112
- end
113
- i += 1
1
+ require 'expcalc'
2
+ def translate_codes(clusters, hpo)
3
+ translated_clusters = []
4
+ clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
5
+ translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}}
6
+ translated_clusters << [clusterID,
7
+ num_of_pats,
8
+ patientIDs_ary,
9
+ patient_hpos_ary,
10
+ translate_codes
11
+ ]
114
12
  end
115
- return matrix, y_names, x_names
13
+ return translated_clusters
116
14
  end
117
15
 
118
- def write_matrix_for_R(matrix, x_names, y_names, file)
119
- File.open(file, 'w') do |f|
120
- f.puts x_names.join("\t")
121
- matrix.each_with_index do |row, i|
122
- f.puts [y_names[i]].concat(row).join("\t")
123
- end
124
- end
125
- end
126
-
127
- def process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, patient_id_type) # get ic and chromosomes
16
+ def process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) # get ic and chromosomes
17
+ ont = Cohort.get_ontology(Cohort.act_ont)
128
18
  all_ics = []
129
19
  all_lengths = []
130
20
  top_cluster_phenotypes = []
@@ -132,30 +22,9 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
132
22
  multi_chromosome_patients = 0
133
23
  processed_clusters = 0
134
24
  clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
135
- next if patient_ids.length == 1
136
- chrs = Hash.new(0)
137
- all_phens = []
138
- profile_ics = []
139
- profile_lengths = []
140
- processed_patients = []
141
- patient_ids.each do |pat_id|
142
- phenotypes = patient_uniq_profiles[pat_id]
143
- #pat_id = pat_id.gsub(/_i\d+$/,'') if patient_id_type != 'generated'
144
- processed_patients << pat_id
145
- profile_ics << get_profile_ic(phenotypes, phenotype_ic)
146
- profile_lengths << phenotypes.length
147
- if processed_clusters < options[:clusters2show_detailed_phen_data]
148
- phen_names, rejected_codes = hpo.translate_ids(phenotypes) #optional
149
- all_phens << phen_names
150
- end
151
- variants = equivalence[pat_id]
152
- variants.each do |variant|
153
- variant_data = patient_data[variant]
154
- chrs[variant_data[CHR]] += 1 if !options[:chromosome_col].nil? && variant_data[CHR] != '-'
155
- end
156
- end
157
- num_of_patients = processed_patients.length
158
- next if num_of_patients == 1 # Check that current cluster only has one patient with several mutations
25
+ num_of_patients = patient_ids.length
26
+ next if num_of_patients == 1
27
+ chrs, all_phens, profile_ics, profile_lengths = process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
159
28
  top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data]
160
29
  all_ics << profile_ics
161
30
  all_lengths << profile_lengths
@@ -170,167 +39,73 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
170
39
  return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
171
40
  end
172
41
 
42
# Gathers per-patient statistics for the patients of a single cluster.
#
# patient_ids        - ids of the patients belonging to the cluster.
# patient_data       - object queried with get_profile(id) and get_vars(id).get_chr
#                      (presumably a Cohort; confirm against the caller).
# phenotype_ic       - lookup passed through to get_profile_ic.
# options            - Hash read for :clusters2show_detailed_phen_data and :chromosome_col.
# ont                - object whose translate_ids(profile) returns [names, rejected_codes].
# processed_clusters - number of clusters already processed by the caller.
#
# Returns four values: a Hash counting chromosomes (left empty when
# options[:chromosome_col] is nil), the translated phenotype names per patient
# (only filled while processed_clusters is below
# options[:clusters2show_detailed_phen_data]), the profile IC of each patient
# and the profile length of each patient.
def process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
  chrs = Hash.new(0)
  all_phens = []
  profile_ics = []
  profile_lengths = []
  patient_ids.each do |pat_id|
    phenotypes = patient_data.get_profile(pat_id)
    profile_ics << get_profile_ic(phenotypes, phenotype_ic)
    profile_lengths << phenotypes.length
    if processed_clusters < options[:clusters2show_detailed_phen_data]
      # Only the translated names are kept; codes that could not be translated are ignored here.
      phen_names, _rejected_codes = ont.translate_ids(phenotypes)
      all_phens << phen_names
    end
    unless options[:chromosome_col].nil?
      patient_data.get_vars(pat_id).get_chr.each { |chr| chrs[chr] += 1 }
    end
  end
  return chrs, all_phens, profile_ics, profile_lengths
end
59
+
173
60
  def get_profile_ic(hpo_names, phenotype_ic)
174
61
  ic = 0
175
62
  profile_length = 0
176
63
  hpo_names.each do |hpo_id|
177
64
  hpo_ic = phenotype_ic[hpo_id]
178
- # STDERR.puts phenotype_ic.inspect
179
- ic += hpo_ic if !hpo_ic.nil?
65
+ raise("The term #{hpo_id} not exists in the given ic table") if hpo_ic.nil?
66
+ ic += hpo_ic
180
67
  profile_length += 1
181
68
  end
182
69
  profile_length = 1 if profile_length == 0
183
70
  return ic.fdiv(profile_length)
184
71
  end
185
72
 
186
- def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
187
- File.open(cluster_ic_data_file, 'w') do |f|
188
- f.puts %w[cluster_id ic Plen].join("\t")
189
- all_ics.each_with_index do |cluster_ics, i|
190
- break if i == limit
191
- cluster_length = cluster_ics.length
192
- cluster_ics.each_with_index do |clust_ic, j|
193
- f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
194
- end
195
- end
196
- end
197
- end
198
-
199
- def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
200
- File.open(cluster_chromosome_data_file, 'w') do |f|
201
- f.puts %w[cluster_id chr count].join("\t")
202
- index = 0
203
- last_id = cluster_data.first.first unless cluster_data.empty?
204
- cluster_data.each do |cluster_id, patient_number, chr, count|
205
- index += 1 if cluster_id != last_id
206
- break if index == limit
207
- f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
208
- last_id = cluster_id
209
- end
210
- end
211
- end
212
-
213
- def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
214
- File.open(coverage_to_plot_file, 'w') do |f|
215
- coverage_to_plot.each do |chr, position, freq|
216
- f.puts "#{chr}\t#{position}\t#{freq}"
217
- end
218
- end
219
- end
220
-
221
- def get_uniq_hpo_profiles(patient_data) # To avoid duplications due to more one mutation in the same patient
222
- hpo_profiles = {}
223
- equivalence = {}
224
- patient_data.each do |variant_id, patient_rec|
225
- pat_id, count = variant_id.split('_i')
226
- hpo_profiles[pat_id] = patient_rec[HPOS]
227
- query = equivalence[pat_id]
228
- if query.nil?
229
- equivalence[pat_id] = [variant_id]
230
- else
231
- query << variant_id
232
- end
233
- end
234
- return hpo_profiles, equivalence
235
- end
236
-
237
- def get_patient_ids(patient_data) # To aviod duplications due to more one mutation in the same patient
238
- ids = []
239
- patient_data.each do |pat_id, hpos|
240
- id, count = pat_id.split('_i')
241
- ids << id
242
- end
243
- return ids.uniq
244
- end
245
-
246
- def get_summary_stats(patient_data, rejected_patients, cohort_hpos, hpo)
73
+ def get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
247
74
  stats = []
248
- stats << ['Unique HPO terms', cohort_hpos.length]
249
- stats << ['Cohort size', get_patient_ids(patient_data).length]
75
+ stats << ['Unique HPO terms', hpo_stats.length]
76
+ stats << ['Cohort size', patient_data.profiles.length]
250
77
  stats << ['Rejected patients by empty profile', rejected_patients.length]
251
- # stats << ['HPOs per patient (average)', hpo.get_profile_mean_length]
252
- stats << ['HPOs per patient (average)', hpo.get_profiles_mean_size]
253
- stats << ['HPO terms per patient: percentile 90', hpo.get_profile_length_at_percentile(perc=90)]
78
+ stats << ['HPOs per patient (average)', patient_data.get_profiles_mean_size]
79
+ stats << ['HPO terms per patient: percentile 90', patient_data.get_profile_length_at_percentile(perc=90)]
80
+ stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)]
81
+ stats << ['DsI for uniq HP terms', patient_data.get_dataset_specifity_index('uniq')]
82
+ stats << ['DsI for frequency weigthed HP terms', patient_data.get_dataset_specifity_index('weigthed')]
83
+ stats << ['Number of unknown phenotypes', rejected_hpos.length]
254
84
  return stats
255
85
  end
256
86
 
257
- def cluster_patients(patient_data, cohort_hpos, matrix_file, clustered_patients_file)
87
+ def dummy_cluster_patients(patient_data, matrix_file, clust_pat_file)
258
88
  if !File.exists?(matrix_file)
259
- pat_hpo_matrix, pat_id, hp_id = generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
89
+ pat_hpo_matrix, pat_id, hp_id = patient_data.to_bmatrix
260
90
  x_axis_file = matrix_file.gsub('.npy','_x.lst')
261
- File.open(x_axis_file, 'w'){|f| f.print hp_id.join("\n") }
262
91
  y_axis_file = matrix_file.gsub('.npy','_y.lst')
263
- File.open(y_axis_file, 'w'){|f| f.print pat_id.join("\n") }
264
- Npy.save(matrix_file, pat_hpo_matrix)
92
+ pat_hpo_matrix.save(matrix_file, hp_id, x_axis_file, pat_id, y_axis_file)
265
93
  end
266
- system("#{File.join(EXTERNAL_CODE, 'get_clusters.R')} -d #{matrix_file} -o #{clustered_patients_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clustered_patients_file)
267
- clustered_patients = load_clustered_patients(clustered_patients_file)
94
+ system_call(EXTERNAL_CODE, 'get_clusters.R', "-d #{matrix_file} -o #{clust_pat_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clust_pat_file)
95
+ clustered_patients = load_clustered_patients(clust_pat_file)
268
96
  return(clustered_patients)
269
97
  end
270
98
 
271
- def get_profile_ontology_distribution_tables(hpo)
272
- ontology_levels, distribution_percentage = hpo.get_profile_ontology_distribution_tables
273
- ontology_levels.unshift(["level", "ontology", "cohort"])
274
- distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
275
- return ontology_levels, distribution_percentage
276
- end
277
-
278
-
279
- def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
280
- CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
281
- suggested_childs.each do |pat_id, suggestions|
282
- warning = nil
283
- warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
284
- csv << ["PATIENT #{pat_id}", "#{warning}"]
285
- csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
286
- suggestions.each do |parent, childs|
287
- parent_code, parent_name = parent
288
- if childs.empty?
289
- csv << ["#{parent_name} (#{parent_code})", '-']
290
- else
291
- parent_writed = false
292
- childs.each do |child_code, child_name|
293
- if !parent_writed
294
- parent_field = "#{parent_name} (#{parent_code})"
295
- parent_writed = true
296
- else
297
- parent_field = ""
298
- end
299
- csv << [parent_field, "#{child_name} (#{child_code})"]
300
- end
301
- end
302
- end
303
- csv << ["", ""]
99
+ def get_mean_size(all_sizes)
100
+ accumulated_size = 0
101
+ number = 0
102
+ all_sizes.each do |size, occurrences|
103
+ accumulated_size += size *occurrences
104
+ number += occurrences
304
105
  end
305
- end
106
+ return accumulated_size.fdiv(number)
306
107
  end
307
108
 
308
- def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
309
- File.open(filename, 'w') do |f|
310
- f.puts "#{x_axis_name}\t#{y_axis_name}"
311
- x_axis_value.each_with_index do |value,i|
312
- y_value = y_axis_value[i]
313
- raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
314
- f.puts [value, y_value].join("\t")
315
- end
316
- end
317
- end
318
-
319
- def process_patient_data(patient_data)
320
- parsed_patient_data = {}
321
- patient_data.each do |patientID, metadata|
322
- phenotypes, chr, start, stop = metadata
323
- next if chr == '-'
324
- info = [patientID, start.to_i, stop.to_i]
325
- query = parsed_patient_data[chr]
326
- if query.nil?
327
- parsed_patient_data[chr] = [info]
328
- else
329
- query << info
330
- end
331
- end
332
- return parsed_patient_data
333
- end
334
109
 
335
110
  def get_final_coverage(raw_coverage, bin_size)
336
111
  coverage_to_plot = []
@@ -360,185 +135,158 @@ def get_sor_length_distribution(raw_coverage)
360
135
  return all_cnvs_length
361
136
  end
362
137
 
363
- def get_cnvs_length(patient_data)
364
- length_stats = Hash.new(0)
365
- patient_data.each do |pat_id, patient_record|
366
- string_hpos, chr, start, stop = patient_record
367
- length_stats[stop - start] += 1
368
- end
369
- return length_stats.to_a.sort!{|stat| stat[1] <=> stat[1] }
370
- end
371
-
372
-
373
138
  def calculate_coverage(regions_data, delete_thresold = 0)
374
139
  raw_coverage = {}
375
140
  n_regions = 0
376
141
  patients = 0
377
142
  nt = 0
378
- regions_data.each do |start, stop, chr, node|
379
- number_of_patients = node.split('.').last.to_i
143
+ regions_data.each do |start, stop, chr, reg_id|
144
+ number_of_patients = reg_id.split('.').last.to_i
380
145
  if number_of_patients <= delete_thresold
381
146
  number_of_patients = 0
382
147
  else
383
148
  n_regions += 1
384
- patients += number_of_patients
385
149
  nt += stop - start
386
150
  end
387
- coords = [start, stop, number_of_patients]
388
- query = raw_coverage[chr]
389
- if query.nil?
390
- raw_coverage[chr] = [coords]
391
- else
392
- query << coords
393
- end
151
+ add_record(raw_coverage, chr, [start, stop, number_of_patients])
152
+ patients += number_of_patients
394
153
  end
395
154
  return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
396
155
  end
397
156
 
398
- def get_profile_redundancy(hpo)
399
- #TODO: sort both arrays consequently
400
- #TODO: bear in mind join both arrays with zip and sort by one, to get an [a[a]]
401
- # profile_sizes = hpo.get_profile_sizes
402
- profile_sizes = hpo.get_profiles_sizes
403
- # parental_hpos_per_profile = hpo.compute_redundant_parental_terms_per_profile
404
- parental_hpos_per_profile = hpo.parentals_per_profile# clean_profiles
405
- parental_hpos_per_profile = parental_hpos_per_profile.map{|item| item[0]}
406
- profile_sizes, parental_hpos_per_profile = profile_sizes.zip(parental_hpos_per_profile).sort_by{|i| i.first}.reverse.transpose
407
- return profile_sizes, parental_hpos_per_profile
408
- end
409
-
410
- def format_profiles_similarity_data(profiles_similarity)
411
- matrix = []
412
- element_names = profiles_similarity.keys
413
- matrix << element_names
414
- profiles_similarity.each do |elementA, relations|
415
- row = [elementA]
416
- element_names.each do |elementB|
417
- if elementA == elementB
418
- row << 'NA'
419
- else
420
- query = relations[elementB]
421
- if !query.nil?
422
- row << query
423
- else
424
- row << profiles_similarity[elementB][elementA]
425
- end
157
+ def get_top_dummy_clusters_stats(top_clust_phen)
158
+ new_cluster_phenotypes = {}
159
+ top_clust_phen.each_with_index do |cluster, clusterID|
160
+ phenotypes_frequency = Hash.new(0)
161
+ total_patients = cluster.length
162
+ cluster.each do |phenotypes|
163
+ phenotypes.each do |p|
164
+ phenotypes_frequency[p] += 1
426
165
  end
427
166
  end
428
- matrix << row
167
+ new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
429
168
  end
430
- matrix[0].unshift('pat')
431
- return matrix
169
+ return new_cluster_phenotypes
432
170
  end
433
171
 
434
- def format_profiles_similarity_data_pairs(profiles_similarity)
435
- pairs = []
436
- element_names = profiles_similarity.keys
437
- profiles_similarity.each do |elementA, relations|
438
- element_names.each do |elementB|
439
- if elementA != elementB
440
- pair = [elementA, elementB]
441
- query = relations[elementB]
442
- if !query.nil?
443
- pair << query
444
- else
445
- pair << profiles_similarity[elementB][elementA]
446
- end
447
- pairs << pair
448
- end
449
- end
172
+ def remove_nested_entries(nested_hash)
173
+ empty_root_ids = []
174
+ nested_hash.each do |root_id, entries|
175
+ entries.select!{|id, val| yield(id, val)}
176
+ empty_root_ids << root_id if entries.empty?
450
177
  end
451
- return pairs
452
- end
453
-
454
- def format_profiles_similarity_data_numo(profiles_similarity)
455
- element_names = profiles_similarity.keys
456
- matrix = Numo::DFloat.zeros(element_names.length, element_names.length)
457
- i = 0
458
- profiles_similarity.each do |elementA, relations|
459
- element_names.each_with_index do |elementB, j|
460
- if elementA != elementB
461
- query = relations[elementB]
462
- if !query.nil?
463
- matrix[i, j] = query
464
- else
465
- matrix[i, j] = profiles_similarity[elementB][elementA]
466
- end
178
+ empty_root_ids.each{|id| nested_hash.delete(id)}
179
+ end
180
+
181
+ def get_semantic_similarity_clustering(options, patient_data, temp_folder)
182
+ template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read
183
+ hpo = Cohort.get_ontology(Cohort.act_ont)
184
+ reference_profiles = nil
185
+ reference_profiles = load_profiles(options[:reference_profiles], hpo) if !options[:reference_profiles].nil?
186
+ Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name|
187
+ matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
188
+ profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
189
+ clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
190
+ if !File.exists?(matrix_filename)
191
+ if reference_profiles.nil?
192
+ profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
193
+ else # AS reference profiles are constant, the sematic comparation will be A => B (A reference). So, we have to invert the elements to perform the comparation
194
+ ont = Cohort.get_ontology(:hpo)
195
+ pat_profiles = ont.profiles
196
+ ont.load_profiles(reference_profiles, reset_stored: true)
197
+ profiles_similarity = ont.compare_profiles(sim_type: method_name.to_sym,
198
+ external_profiles: pat_profiles,
199
+ bidirectional: false)
200
+ ont.load_profiles(pat_profiles, reset_stored: true)
201
+ profiles_similarity = invert_nested_hash(profiles_similarity)
202
+ end
203
+ remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
204
+ write_profile_pairs(profiles_similarity, profiles_similarity_filename)
205
+ if reference_profiles.nil?
206
+ axis_file = matrix_filename.gsub('.npy','.lst')
207
+ similarity_matrix, axis_names = profiles_similarity.to_wmatrix(squared: true, symm: true)
208
+ similarity_matrix.save(matrix_filename, axis_names, axis_file)
209
+ else
210
+ axis_file_x = matrix_filename.gsub('.npy','_x.lst')
211
+ axis_file_y = matrix_filename.gsub('.npy','_y.lst')
212
+ similarity_matrix, y_names, x_names = profiles_similarity.to_wmatrix(squared: false, symm: true)
213
+ similarity_matrix.save(matrix_filename, y_names, axis_file_y, x_names, axis_file_x)
467
214
  end
468
215
  end
469
- i += 1
470
- end
471
- return matrix, element_names
472
- end
473
-
474
- def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
475
- File.open(similarity_matrix_file, 'w') do |f|
476
- similarity_matrix.each do |row|
477
- f.puts row.join("\t")
216
+ ext_var = ''
217
+ if method_name == 'resnik'
218
+ ext_var = '-m max'
219
+ elsif method_name == 'lin'
220
+ ext_var = '-m comp1'
478
221
  end
479
- end
480
- end
481
-
482
- def write_profile_pairs(similarity_pairs, filename)
483
- File.open(filename, 'w') do |f|
484
- similarity_pairs.each do |pairsA, pairsB_and_values|
485
- pairsB_and_values.each do |pairsB, values|
486
- f.puts "#{pairsA}\t#{pairsB}\t#{values}"
222
+ cluster_file = "#{method_name}_clusters.txt"
223
+ if !reference_profiles.nil?
224
+ ext_var << ' -s'
225
+ axis_file = "#{axis_file_y},#{axis_file_x}"
226
+ cluster_file = "#{method_name}_clusters_rows.txt"
227
+ end
228
+ out_file = File.join(temp_folder, method_name)
229
+ system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file + '_heatmap.png')
230
+ clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, cluster_file), patient_data)
231
+ write_patient_hpo_stat(get_cluster_metadata(clusters_info), clusters_distribution_filename)
232
+ out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
233
+ system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
234
+ sim_mat4cluster = {}
235
+ if options[:detailed_clusters]
236
+ clusters_codes.each do |cluster|
237
+ cluster_cohort = Cohort.new
238
+ clID, patient_number, patient_ids, hpo_codes = cluster
239
+ patient_ids.each_with_index {|patID, i| cluster_cohort.add_record([patID, hpo_codes[i], []])}
240
+ cluster_profiles = cluster_cohort.profiles
241
+ ref_profile = cluster_cohort.get_general_profile
242
+ hpo.load_profiles({ref: ref_profile}, reset_stored: true)
243
+ similarities = hpo.compare_profiles(external_profiles: cluster_profiles, sim_type: :lin, bidirectional: false)
244
+ candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(ref_profile, similarities[:ref], cluster_profiles, hpo, 100, 100)
245
+ candidate_sim_matrix.unshift(['HP'] + candidates_ids)
246
+ sim_mat4cluster[clID] = candidate_sim_matrix
487
247
  end
488
248
  end
489
- end
490
- end
491
249
 
492
- def parse_clusters_file(clusters_file, patient_profiles)
493
- clusters_info = {}
494
- clusters_table = []
495
- File.open(clusters_file).each do |line|
496
- line.chomp!
497
- patientID, clusterID = line.split("\t")
498
- patientHPOProfile = patient_profiles[patientID]
499
- query = clusters_info[clusterID]
500
- if query.nil?
501
- clusters_info[clusterID] = {patientID => patientHPOProfile}
502
- else
503
- query[patientID] = patientHPOProfile
504
- end
505
- end
506
- clusters_info.each do |clusterID, patients_info|
507
- patients_per_cluster = patients_info.keys.length
508
- clusters_table << [clusterID, patients_per_cluster, patients_info.keys, patients_info.values]
250
+
251
+ clusters = translate_codes(clusters_codes, hpo)
252
+ container = {
253
+ :temp_folder => temp_folder,
254
+ :cluster_name => method_name,
255
+ :clusters => clusters,
256
+ :hpo => hpo,
257
+ :sim_mat4cluster => sim_mat4cluster
258
+ }
259
+
260
+ report = Report_html.new(container, 'Patient clusters report')
261
+ report.build(template)
262
+ report.write(options[:output_file]+"_#{method_name}_clusters.html")
263
+ system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -m #{method_name} -o #{File.join(temp_folder, method_name + '_sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png'))
509
264
  end
510
- return clusters_table, clusters_info
511
265
  end
512
266
 
513
- def get_patient_hpo_frequency(patient_uniq_profiles, hpo_frequency_file)
514
- hpo_frequency = Hash.new(0)
515
- patient_uniq_profiles.values.each do |hpos|
516
- hpos.each do |hpo|
517
- hpo_frequency[hpo] += 1
518
- end
519
- end
520
- File.open(hpo_frequency_file, 'w') do |f|
521
- hpo_frequency.each do |hpo_code, freq|
522
- f.puts "#{hpo_code.to_s}\t#{freq}"
267
# Swaps the two key levels of a nested hash:
#   {k1 => {k2 => value}}  becomes  {k2 => {k1 => value}}
#
# h - Hash whose values are themselves hashes (anything whose #each yields
#     key/value pairs works).
#
# Returns a new Hash. Neither h nor its inner hashes are modified; the leaf
# values are shared, not copied.
def invert_nested_hash(h)
  new_h = {}
  h.each do |k1, vals1|
    # One pass over the inner hash visits every (k2, value) pair exactly once.
    vals1.each do |k2, vals2|
      query = new_h[k2]
      if query.nil?
        new_h[k2] = {k1 => vals2}
      else
        query[k1] = vals2
      end
    end
  end
  return new_h
end
526
283
 
527
- def get_cluster_metadata(clusters_info, output_file)
284
+ def get_cluster_metadata(clusters_info)
528
285
  average_hp_per_pat_distribution = []
529
- tmp = []
530
286
  clusters_info.each do |cl_id, pat_info|
531
287
  hp_per_pat_in_clust = pat_info.values.map{|a| a.length}
532
288
  hp_per_pat_ave = hp_per_pat_in_clust.sum.fdiv(hp_per_pat_in_clust.length)
533
289
  average_hp_per_pat_distribution << [pat_info.length, hp_per_pat_ave]
534
- tmp << hp_per_pat_in_clust
535
- end
536
- total_clusters = clusters_info.length
537
- average_phenotypes_by_cluster = tmp.flatten.sum.fdiv(total_clusters)
538
- File.open(output_file, 'w') do |f|
539
- f.puts "#{'PatientsNumber'}\t#{'HPOAverage'}"
540
- average_hp_per_pat_distribution.each do |patient_num, ave|
541
- f.puts "#{patient_num}\t#{ave}"
542
- end
543
290
  end
291
+ return average_hp_per_pat_distribution
544
292
  end