pets 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/README.md +79 -5
  4. data/bin/coPatReporter.rb +68 -156
  5. data/bin/comPatMondo.rb +1 -4
  6. data/bin/evidence_profiler.rb +102 -150
  7. data/bin/get_gen_features.rb +146 -0
  8. data/bin/get_network_nodes.rb +79 -132
  9. data/bin/get_sorted_profs.rb +25 -36
  10. data/bin/install_deps.rb +8 -0
  11. data/bin/paco_translator.rb +29 -72
  12. data/bin/phen2reg.rb +1 -4
  13. data/bin/profiles2phenopacket.rb +86 -0
  14. data/bin/reg2phen.rb +1 -3
  15. data/example_datasets/associations_file.txt +757 -0
  16. data/example_datasets/example_patient.txt +6 -0
  17. data/example_datasets/example_patient_hpos.txt +15 -0
  18. data/example_datasets/genes.txt +8 -0
  19. data/example_datasets/hpo2ci.txt +2798 -0
  20. data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
  21. data/example_datasets/launch.sh +20 -0
  22. data/external_code/generate_boxpot.R +51 -21
  23. data/external_code/get_clusters.R +2 -2
  24. data/external_code/install_R_dependencies.R +16 -0
  25. data/external_code/plot_heatmap.R +34 -30
  26. data/lib/pets/coPatReporterMethods.rb +172 -424
  27. data/lib/pets/cohort.rb +309 -0
  28. data/lib/pets/common_optparse.rb +30 -0
  29. data/lib/pets/constants.rb +8 -0
  30. data/lib/pets/generalMethods.rb +29 -319
  31. data/lib/pets/genomic_features.rb +240 -0
  32. data/lib/pets/io.rb +481 -0
  33. data/lib/pets/parsers/cohort_parser.rb +111 -0
  34. data/lib/pets/parsers/reference_parser.rb +39 -0
  35. data/lib/pets/version.rb +1 -1
  36. data/lib/pets.rb +9 -0
  37. data/pets.gemspec +7 -3
  38. data/templates/cluster_report.erb +25 -5
  39. data/templates/cohort_report.erb +5 -7
  40. data/templates/evidence_profile.erb +20 -4
  41. data/templates/patient_report.erb +1 -1
  42. metadata +96 -5
@@ -1,130 +1,20 @@
1
- require 'numo/narray'
2
- require 'semtools'
3
-
4
- HPOS = 0
5
- CHR = 1
6
- START = 2
7
- STOP = 3
8
-
9
- def load_hpo_ontology(hpo_file, excluded_hpo_file)
10
- hpo = nil
11
- if !hpo_file.include?('.json')
12
- if !excluded_hpo_file.nil?
13
- hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
14
- else
15
- hpo = Ontology.new(file: hpo_file, load_file: true)
16
- end
17
- else
18
- hpo = Ontology.new
19
- hpo.read(hpo_file)
20
- if !excluded_hpo_file.nil?
21
- hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
22
- hpo.remove_removable()
23
- hpo.build_index()
24
- end
25
- end
26
- return hpo
27
- end
28
-
29
- def format_patient_data(patient_data, options, hpo)
30
- rejected_hpos = []
31
- rejected_patients = []
32
- patient_data.each do |pat_id, patient_record|
33
- hpos, chr, start, stop = patient_record
34
-
35
- if options[:hpo_names]
36
- hpos, pat_rejected_hpos = hpo.translate_names(hpos)
37
- if !pat_rejected_hpos.empty?
38
- STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAMES '#{pat_rejected_hpos.join(',')}'. Rejected."
39
- rejected_hpos.concat(pat_rejected_hpos)
40
- end
41
- end
42
-
43
- hpos, pat_rejected_hpos = hpo.check_ids(hpos.map{|a| a.to_sym})
44
- if !pat_rejected_hpos.empty?
45
- STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODES '#{pat_rejected_hpos.join(',')}'. Rejected."
46
- rejected_hpos.concat(pat_rejected_hpos)
47
- end
48
- if hpos.empty?
49
- rejected_patients << pat_id
50
- else
51
- patient_record[HPOS] = hpos
52
- end
53
- end
54
- return rejected_hpos.uniq, rejected_patients
55
- end
56
-
57
- def compute_hpo_list_and_childs(patient_data, hpo)
58
- all_hpo = []
59
- suggested_childs = {}
60
- total_terms = 0
61
- terms_with_more_specific_childs = 0
62
- patient_data.each do |pat_id, hpos|
63
- total_terms += hpos.length
64
- more_specific_childs = hpo.get_childs_table(hpos, true)
65
- terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
66
- suggested_childs[pat_id] = more_specific_childs
67
- all_hpo.concat(hpos)
68
- end
69
- return all_hpo.uniq, suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
70
- end
71
-
72
- def clean_patient_profiles(hpo, patient_profiles)
73
- rejected_patients = []
74
- patient_profiles.each do |pat, prof|
75
- phens = hpo.clean_profile_hard(prof)
76
- if phens.empty?
77
- rejected_patients << pat
78
- else
79
- patient_profiles[pat] = phens
80
- end
81
- end
82
- patient_profiles.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)}
83
- hpo.profiles = {}
84
- hpo.load_profiles(patient_profiles)
85
-
86
- end
87
-
88
- def generate_patient_hpo_matrix(patient_data, cohort_hpos)
89
- matrix = []
90
- n = cohort_hpos.length
91
- patient_data.each do |pat_id, pat_hpos|
92
- vector = Array.new(n, 0)
93
- pat_hpos.each do |hpo|
94
- vector[cohort_hpos.index(hpo)] = 1
95
- end
96
- matrix << vector
97
- end
98
- return matrix
99
- end
100
-
101
- def generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
102
- y_names = patient_data.keys
103
- x_names = cohort_hpos
104
- x_names_indx = {}
105
- cohort_hpos.each_with_index{|hp,x| x_names_indx[hp]=x}
106
- # row (y), cols (x)
107
- matrix = Numo::DFloat.zeros(patient_data.length, cohort_hpos.length)
108
- i = 0
109
- patient_data.each do |pat_id, pat_hpos|
110
- pat_hpos.each do |hp|
111
- matrix[i, x_names_indx[hp]] = 1
112
- end
113
- i += 1
1
+ require 'expcalc'
2
# Appends the translated phenotype names to every cluster record.
#
# @param clusters [Array<Array>] records shaped as
#   [cluster_id, number_of_patients, patient_ids, patient_hpo_profiles]
# @param hpo [#translate_id] translator; only `translate_id(code)` is called on it
# @return [Array<Array>] one record per input cluster: the four original fields
#   followed by the translated profiles (same nesting as patient_hpo_profiles)
def translate_codes(clusters, hpo)
  clusters.map do |cluster_id, num_of_pats, patient_ids, patient_hpos_ary|
    # The local is deliberately not called `translate_codes`, which would shadow this method.
    translated_profiles = patient_hpos_ary.map do |patient_hpos|
      patient_hpos.map { |hpo_code| hpo.translate_id(hpo_code) }
    end
    [cluster_id, num_of_pats, patient_ids, patient_hpos_ary, translated_profiles]
  end
end
117
15
 
118
- def write_matrix_for_R(matrix, x_names, y_names, file)
119
- File.open(file, 'w') do |f|
120
- f.puts x_names.join("\t")
121
- matrix.each_with_index do |row, i|
122
- f.puts [y_names[i]].concat(row).join("\t")
123
- end
124
- end
125
- end
126
-
127
- def process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, patient_id_type) # get ic and chromosomes
16
+ def process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) # get ic and chromosomes
17
+ ont = Cohort.get_ontology(Cohort.act_ont)
128
18
  all_ics = []
129
19
  all_lengths = []
130
20
  top_cluster_phenotypes = []
@@ -132,30 +22,9 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
132
22
  multi_chromosome_patients = 0
133
23
  processed_clusters = 0
134
24
  clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
135
- next if patient_ids.length == 1
136
- chrs = Hash.new(0)
137
- all_phens = []
138
- profile_ics = []
139
- profile_lengths = []
140
- processed_patients = []
141
- patient_ids.each do |pat_id|
142
- phenotypes = patient_uniq_profiles[pat_id]
143
- #pat_id = pat_id.gsub(/_i\d+$/,'') if patient_id_type != 'generated'
144
- processed_patients << pat_id
145
- profile_ics << get_profile_ic(phenotypes, phenotype_ic)
146
- profile_lengths << phenotypes.length
147
- if processed_clusters < options[:clusters2show_detailed_phen_data]
148
- phen_names, rejected_codes = hpo.translate_ids(phenotypes) #optional
149
- all_phens << phen_names
150
- end
151
- variants = equivalence[pat_id]
152
- variants.each do |variant|
153
- variant_data = patient_data[variant]
154
- chrs[variant_data[CHR]] += 1 if !options[:chromosome_col].nil? && variant_data[CHR] != '-'
155
- end
156
- end
157
- num_of_patients = processed_patients.length
158
- next if num_of_patients == 1 # Check that current cluster only has one patient with several mutations
25
+ num_of_patients = patient_ids.length
26
+ next if num_of_patients == 1
27
+ chrs, all_phens, profile_ics, profile_lengths = process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
159
28
  top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data]
160
29
  all_ics << profile_ics
161
30
  all_lengths << profile_lengths
@@ -170,167 +39,73 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
170
39
  return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
171
40
  end
172
41
 
42
# Collects per-patient statistics for a single cluster.
#
# For every patient id the profile is read with `patient_data.get_profile`,
# its mean IC and its length are recorded and, while `processed_clusters` is
# below options[:clusters2show_detailed_phen_data], the translated term names
# are stored too. Chromosomes are only counted when options[:chromosome_col]
# is set.
#
# @return [Array] chromosome counts (Hash), translated profiles, profile ICs
#   and profile lengths, in that order
def process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
  chromosome_counts = Hash.new(0)
  translated_profiles = []
  ics = []
  lengths = []
  patient_ids.each do |pat_id|
    profile = patient_data.get_profile(pat_id)
    ics.push(get_profile_ic(profile, phenotype_ic))
    lengths.push(profile.length)
    if processed_clusters < options[:clusters2show_detailed_phen_data]
      names, _rejected_codes = ont.translate_ids(profile) # optional
      translated_profiles.push(names)
    end
    next if options[:chromosome_col].nil?

    patient_data.get_vars(pat_id).get_chr.each { |chr| chromosome_counts[chr] += 1 }
  end
  return chromosome_counts, translated_profiles, ics, lengths
end
59
+
173
60
# Mean information content (IC) of a phenotype profile.
#
# @param hpo_names [Array] term ids of the profile
# @param phenotype_ic [Hash] term id => IC value
# @return [Float] sum of the ICs divided by the number of terms; 0.0 for an
#   empty profile (the divisor is forced to 1)
# @raise [RuntimeError] when a term has no entry in phenotype_ic
def get_profile_ic(hpo_names, phenotype_ic)
  total_ic = hpo_names.reduce(0) do |acc, hpo_id|
    hpo_ic = phenotype_ic[hpo_id]
    raise("The term #{hpo_id} not exists in the given ic table") if hpo_ic.nil?

    acc + hpo_ic
  end
  # Every term either contributes or raises, so the count equals the profile length.
  profile_length = [hpo_names.length, 1].max
  total_ic.fdiv(profile_length)
end
185
72
 
186
- def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
187
- File.open(cluster_ic_data_file, 'w') do |f|
188
- f.puts %w[cluster_id ic Plen].join("\t")
189
- all_ics.each_with_index do |cluster_ics, i|
190
- break if i == limit
191
- cluster_length = cluster_ics.length
192
- cluster_ics.each_with_index do |clust_ic, j|
193
- f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
194
- end
195
- end
196
- end
197
- end
198
-
199
- def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
200
- File.open(cluster_chromosome_data_file, 'w') do |f|
201
- f.puts %w[cluster_id chr count].join("\t")
202
- index = 0
203
- last_id = cluster_data.first.first unless cluster_data.empty?
204
- cluster_data.each do |cluster_id, patient_number, chr, count|
205
- index += 1 if cluster_id != last_id
206
- break if index == limit
207
- f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
208
- last_id = cluster_id
209
- end
210
- end
211
- end
212
-
213
- def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
214
- File.open(coverage_to_plot_file, 'w') do |f|
215
- coverage_to_plot.each do |chr, position, freq|
216
- f.puts "#{chr}\t#{position}\t#{freq}"
217
- end
218
- end
219
- end
220
-
221
- def get_uniq_hpo_profiles(patient_data) # To avoid duplications due to more one mutation in the same patient
222
- hpo_profiles = {}
223
- equivalence = {}
224
- patient_data.each do |variant_id, patient_rec|
225
- pat_id, count = variant_id.split('_i')
226
- hpo_profiles[pat_id] = patient_rec[HPOS]
227
- query = equivalence[pat_id]
228
- if query.nil?
229
- equivalence[pat_id] = [variant_id]
230
- else
231
- query << variant_id
232
- end
233
- end
234
- return hpo_profiles, equivalence
235
- end
236
-
237
- def get_patient_ids(patient_data) # To aviod duplications due to more one mutation in the same patient
238
- ids = []
239
- patient_data.each do |pat_id, hpos|
240
- id, count = pat_id.split('_i')
241
- ids << id
242
- end
243
- return ids.uniq
244
- end
245
-
246
- def get_summary_stats(patient_data, rejected_patients, cohort_hpos, hpo)
73
# Builds the summary table of the cohort as [label, value] rows.
#
# @param patient_data [#profiles, #get_profiles_mean_size,
#   #get_profile_length_at_percentile, #get_dataset_specifity_index] cohort object
# @param rejected_patients [#length] patients discarded for having an empty profile
# @param hpo_stats [#length] one entry per unique term seen in the cohort
# @param fraction_terms_specific_childs [Numeric] fraction (0..1) reported as a percentage
# @param rejected_hpos [#length] unknown phenotypes
# @return [Array<Array>] rows in display order
def get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
  [
    ['Unique HPO terms', hpo_stats.length],
    ['Cohort size', patient_data.profiles.length],
    ['Rejected patients by empty profile', rejected_patients.length],
    ['HPOs per patient (average)', patient_data.get_profiles_mean_size],
    # The percentile is passed positionally; the former `perc=90` only created an unused local.
    ['HPO terms per patient: percentile 90', patient_data.get_profile_length_at_percentile(90)],
    ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)],
    ['DsI for uniq HP terms', patient_data.get_dataset_specifity_index('uniq')],
    ['DsI for frequency weigthed HP terms', patient_data.get_dataset_specifity_index('weigthed')],
    ['Number of unknown phenotypes', rejected_hpos.length]
  ]
end
256
86
 
257
- def cluster_patients(patient_data, cohort_hpos, matrix_file, clustered_patients_file)
87
# Clusters patients from their binary patient x term matrix using the
# external get_clusters.R script, reusing files left by a previous run.
#
# @param patient_data [#to_bmatrix] cohort; only `to_bmatrix` is called on it
# @param matrix_file [String] path of the matrix file (expected to contain '.npy')
# @param clust_pat_file [String] path of the clustering output
# @return whatever `load_clustered_patients(clust_pat_file)` returns
def dummy_cluster_patients(patient_data, matrix_file, clust_pat_file)
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  unless File.exist?(matrix_file)
    pat_hpo_matrix, pat_id, hp_id = patient_data.to_bmatrix
    x_axis_file = matrix_file.gsub('.npy','_x.lst')
    y_axis_file = matrix_file.gsub('.npy','_y.lst')
    pat_hpo_matrix.save(matrix_file, hp_id, x_axis_file, pat_id, y_axis_file)
  end
  # The R script is skipped when its output is already present.
  unless File.exist?(clust_pat_file)
    system_call(EXTERNAL_CODE, 'get_clusters.R', "-d #{matrix_file} -o #{clust_pat_file} -y #{matrix_file.gsub('.npy','')}")
  end
  load_clustered_patients(clust_pat_file)
end
270
98
 
271
- def get_profile_ontology_distribution_tables(hpo)
272
- ontology_levels, distribution_percentage = hpo.get_profile_ontology_distribution_tables
273
- ontology_levels.unshift(["level", "ontology", "cohort"])
274
- distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
275
- return ontology_levels, distribution_percentage
276
- end
277
-
278
-
279
- def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
280
- CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
281
- suggested_childs.each do |pat_id, suggestions|
282
- warning = nil
283
- warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
284
- csv << ["PATIENT #{pat_id}", "#{warning}"]
285
- csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
286
- suggestions.each do |parent, childs|
287
- parent_code, parent_name = parent
288
- if childs.empty?
289
- csv << ["#{parent_name} (#{parent_code})", '-']
290
- else
291
- parent_writed = false
292
- childs.each do |child_code, child_name|
293
- if !parent_writed
294
- parent_field = "#{parent_name} (#{parent_code})"
295
- parent_writed = true
296
- else
297
- parent_field = ""
298
- end
299
- csv << [parent_field, "#{child_name} (#{child_code})"]
300
- end
301
- end
302
- end
303
- csv << ["", ""]
99
# Weighted mean of a size distribution.
#
# @param all_sizes [Hash, Array<Array>] pairs of (size, occurrences)
# @return [Float] sum(size * occurrences) / sum(occurrences); NaN when the
#   distribution is empty (0.fdiv(0)), as before
def get_mean_size(all_sizes)
  accumulated_size = all_sizes.sum { |size, occurrences| size * occurrences }
  number = all_sizes.sum { |_size, occurrences| occurrences }
  accumulated_size.fdiv(number)
end
307
108
 
308
- def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
309
- File.open(filename, 'w') do |f|
310
- f.puts "#{x_axis_name}\t#{y_axis_name}"
311
- x_axis_value.each_with_index do |value,i|
312
- y_value = y_axis_value[i]
313
- raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
314
- f.puts [value, y_value].join("\t")
315
- end
316
- end
317
- end
318
-
319
- def process_patient_data(patient_data)
320
- parsed_patient_data = {}
321
- patient_data.each do |patientID, metadata|
322
- phenotypes, chr, start, stop = metadata
323
- next if chr == '-'
324
- info = [patientID, start.to_i, stop.to_i]
325
- query = parsed_patient_data[chr]
326
- if query.nil?
327
- parsed_patient_data[chr] = [info]
328
- else
329
- query << info
330
- end
331
- end
332
- return parsed_patient_data
333
- end
334
109
 
335
110
  def get_final_coverage(raw_coverage, bin_size)
336
111
  coverage_to_plot = []
@@ -360,185 +135,158 @@ def get_sor_length_distribution(raw_coverage)
360
135
  return all_cnvs_length
361
136
  end
362
137
 
363
- def get_cnvs_length(patient_data)
364
- length_stats = Hash.new(0)
365
- patient_data.each do |pat_id, patient_record|
366
- string_hpos, chr, start, stop = patient_record
367
- length_stats[stop - start] += 1
368
- end
369
- return length_stats.to_a.sort!{|stat| stat[1] <=> stat[1] }
370
- end
371
-
372
-
373
138
# Groups regions by chromosome and gathers coverage totals.
#
# The patient count of a region is the integer after the last '.' of its id.
# Regions whose count is not above `delete_thresold` are stored with a count
# of 0 and are left out of the region and nucleotide totals.
#
# @param regions_data [Enumerable] records of [start, stop, chr, region_id]
# @param delete_thresold [Numeric] counts up to this value are zeroed
# @return [Array] coverage per chromosome, number of kept regions, nucleotides
#   covered by kept regions, and mean patients per kept region
def calculate_coverage(regions_data, delete_thresold = 0)
  coverage_by_chr = {}
  kept_regions = 0
  patient_total = 0
  covered_nt = 0
  regions_data.each do |start, stop, chr, reg_id|
    region_patients = reg_id.split('.').last.to_i
    if region_patients > delete_thresold
      kept_regions += 1
      covered_nt += stop - start
    else
      region_patients = 0
    end
    add_record(coverage_by_chr, chr, [start, stop, region_patients])
    patient_total += region_patients
  end
  return coverage_by_chr, kept_regions, covered_nt, patient_total.fdiv(kept_regions)
end
397
156
 
398
- def get_profile_redundancy(hpo)
399
- #TODO: sort both arrays consequently
400
- #TODO: bear in mind join both arrays with zip and sort by one, to get an [a[a]]
401
- # profile_sizes = hpo.get_profile_sizes
402
- profile_sizes = hpo.get_profiles_sizes
403
- # parental_hpos_per_profile = hpo.compute_redundant_parental_terms_per_profile
404
- parental_hpos_per_profile = hpo.parentals_per_profile# clean_profiles
405
- parental_hpos_per_profile = parental_hpos_per_profile.map{|item| item[0]}
406
- profile_sizes, parental_hpos_per_profile = profile_sizes.zip(parental_hpos_per_profile).sort_by{|i| i.first}.reverse.transpose
407
- return profile_sizes, parental_hpos_per_profile
408
- end
409
-
410
- def format_profiles_similarity_data(profiles_similarity)
411
- matrix = []
412
- element_names = profiles_similarity.keys
413
- matrix << element_names
414
- profiles_similarity.each do |elementA, relations|
415
- row = [elementA]
416
- element_names.each do |elementB|
417
- if elementA == elementB
418
- row << 'NA'
419
- else
420
- query = relations[elementB]
421
- if !query.nil?
422
- row << query
423
- else
424
- row << profiles_similarity[elementB][elementA]
425
- end
157
# Summarises how often each phenotype appears inside every cluster.
#
# @param top_clust_phen [Array<Array<Array>>] clusters, each a list of
#   patient profiles, each profile a list of phenotypes
# @return [Hash{Integer=>Array}] cluster position => [number of patients,
#   phenotypes in first-seen order, percentage of patients per phenotype]
def get_top_dummy_clusters_stats(top_clust_phen)
  top_clust_phen.each_with_index.each_with_object({}) do |(cluster, cluster_index), stats|
    total_patients = cluster.length
    frequency = Hash.new(0)
    cluster.each do |phenotypes|
      # Named `phenotype` rather than `p`, which shadowed Kernel#p.
      phenotypes.each { |phenotype| frequency[phenotype] += 1 }
    end
    percentages = frequency.values.map { |count| count.fdiv(total_patients) * 100 }
    stats[cluster_index] = [total_patients, frequency.keys, percentages]
  end
end
433
171
 
434
- def format_profiles_similarity_data_pairs(profiles_similarity)
435
- pairs = []
436
- element_names = profiles_similarity.keys
437
- profiles_similarity.each do |elementA, relations|
438
- element_names.each do |elementB|
439
- if elementA != elementB
440
- pair = [elementA, elementB]
441
- query = relations[elementB]
442
- if !query.nil?
443
- pair << query
444
- else
445
- pair << profiles_similarity[elementB][elementA]
446
- end
447
- pairs << pair
448
- end
449
- end
172
# Filters a two-level hash in place.
#
# Inner entries are kept only when the given block returns a truthy value for
# (id, value); root keys whose inner hash ends up empty are then deleted.
#
# @param nested_hash [Hash{Object=>Hash}] structure to filter (mutated)
# @yieldparam id [Object] inner key
# @yieldparam val [Object] inner value
# @return [Array] root keys that were removed
def remove_nested_entries(nested_hash)
  nested_hash.each_value do |entries|
    entries.select! { |id, val| yield(id, val) }
  end
  # Collected before deleting so the hash is never modified while it is iterated.
  empty_root_ids = nested_hash.select { |_root_id, entries| entries.empty? }.keys
  empty_root_ids.each { |root_id| nested_hash.delete(root_id) }
end
180
+
181
# Runs the semantic-similarity clustering once per requested method and writes
# one HTML cluster report per method.
#
# Option keys read here: :reference_profiles, :clustering_methods, :threads,
# :sim_thr, :minClusterProportion, :detailed_clusters and :output_file.
# Intermediate files are written to temp_folder and reused when present.
#
# @param options [Hash] command-line options (keys listed above)
# @param patient_data [#compare_profiles] cohort; also handed to parse_clusters_file
# @param temp_folder [String] folder for matrices, axis files and plots
def get_semantic_similarity_clustering(options, patient_data, temp_folder)
  # File.read closes the handle; File.open(...).read left it open.
  template = File.read(File.join(REPORT_FOLDER, 'cluster_report.erb'))
  hpo = Cohort.get_ontology(Cohort.act_ont)
  reference_profiles = nil
  reference_profiles = load_profiles(options[:reference_profiles], hpo) if !options[:reference_profiles].nil?
  Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name|
    matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
    profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
    clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
    # Axis file names are derived up front. They used to be assigned only inside
    # the "matrix not cached" branch, so a cached matrix left them nil and
    # plot_heatmap.R was called with an empty -y argument.
    axis_file = matrix_filename.gsub('.npy','.lst')
    axis_file_x = matrix_filename.gsub('.npy','_x.lst')
    axis_file_y = matrix_filename.gsub('.npy','_y.lst')
    if !File.exist?(matrix_filename) # File.exists? was removed in Ruby 3.2
      if reference_profiles.nil?
        profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
      else
        # As the reference profiles are constant, the semantic comparison is computed
        # as A => B (A being the reference), so the result has to be inverted afterwards.
        # NOTE(review): this uses the :hpo ontology while the rest of the method uses Cohort.act_ont — confirm.
        ont = Cohort.get_ontology(:hpo)
        pat_profiles = ont.profiles
        ont.load_profiles(reference_profiles, reset_stored: true)
        profiles_similarity = ont.compare_profiles(sim_type: method_name.to_sym,
          external_profiles: pat_profiles,
          bidirectional: false)
        ont.load_profiles(pat_profiles, reset_stored: true)
        profiles_similarity = invert_nested_hash(profiles_similarity)
      end
      remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
      write_profile_pairs(profiles_similarity, profiles_similarity_filename)
      if reference_profiles.nil?
        similarity_matrix, axis_names = profiles_similarity.to_wmatrix(squared: true, symm: true)
        similarity_matrix.save(matrix_filename, axis_names, axis_file)
      else
        similarity_matrix, y_names, x_names = profiles_similarity.to_wmatrix(squared: false, symm: true)
        similarity_matrix.save(matrix_filename, y_names, axis_file_y, x_names, axis_file_x)
      end
    end
    ext_var = ''
    if method_name == 'resnik'
      ext_var = '-m max'
    elsif method_name == 'lin'
      ext_var = '-m comp1'
    end
    cluster_file = "#{method_name}_clusters.txt"
    if !reference_profiles.nil?
      ext_var << ' -s'
      axis_file = "#{axis_file_y},#{axis_file_x}"
      cluster_file = "#{method_name}_clusters_rows.txt"
    end
    out_file = File.join(temp_folder, method_name)
    system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exist?(out_file + '_heatmap.png')
    clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, cluster_file), patient_data)
    write_patient_hpo_stat(get_cluster_metadata(clusters_info), clusters_distribution_filename)
    out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
    # NOTE(review): the check uses out_file without an extension — confirm what xyplot_graph.R writes.
    system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exist?(out_file)
    sim_mat4cluster = {}
    if options[:detailed_clusters]
      clusters_codes.each do |cluster|
        cluster_cohort = Cohort.new
        clID, _patient_number, patient_ids, hpo_codes = cluster
        patient_ids.each_with_index {|patID, i| cluster_cohort.add_record([patID, hpo_codes[i], []])}
        cluster_profiles = cluster_cohort.profiles
        ref_profile = cluster_cohort.get_general_profile
        hpo.load_profiles({ref: ref_profile}, reset_stored: true)
        similarities = hpo.compare_profiles(external_profiles: cluster_profiles, sim_type: :lin, bidirectional: false)
        candidate_sim_matrix, _candidates, candidates_ids = get_similarity_matrix(ref_profile, similarities[:ref], cluster_profiles, hpo, 100, 100)
        candidate_sim_matrix.unshift(['HP'] + candidates_ids)
        sim_mat4cluster[clID] = candidate_sim_matrix
      end
    end

    clusters = translate_codes(clusters_codes, hpo)
    container = {
      :temp_folder => temp_folder,
      :cluster_name => method_name,
      :clusters => clusters,
      :hpo => hpo,
      :sim_mat4cluster => sim_mat4cluster
    }

    report = Report_html.new(container, 'Patient clusters report')
    report.build(template)
    report.write(options[:output_file]+"_#{method_name}_clusters.html")
    # NOTE(review): the check looks for 'sim_boxplot.png' while the output prefix is
    # "<method>_sim_boxplot", so it probably never matches — confirm against generate_boxpot.R.
    system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -m #{method_name} -o #{File.join(temp_folder, method_name + '_sim_boxplot')}") if !File.exist?(File.join(temp_folder, 'sim_boxplot.png'))
  end
end
512
266
 
513
- def get_patient_hpo_frequency(patient_uniq_profiles, hpo_frequency_file)
514
- hpo_frequency = Hash.new(0)
515
- patient_uniq_profiles.values.each do |hpos|
516
- hpos.each do |hpo|
517
- hpo_frequency[hpo] += 1
518
- end
519
- end
520
- File.open(hpo_frequency_file, 'w') do |f|
521
- hpo_frequency.each do |hpo_code, freq|
522
- f.puts "#{hpo_code.to_s}\t#{freq}"
267
# Swaps the two key levels of a nested hash: {a => {b => v}} becomes {b => {a => v}}.
#
# The previous version wrapped the inner pass in a second `vals1.each` loop,
# repeating identical assignments once per inner entry; a single pass gives
# the same result.
#
# @param h [Hash{Object=>Hash}] two-level hash
# @return [Hash{Object=>Hash}] new hash with the key levels exchanged
def invert_nested_hash(h)
  h.each_with_object({}) do |(outer_key, inner), inverted|
    inner.each do |inner_key, value|
      (inverted[inner_key] ||= {})[outer_key] = value
    end
  end
end
526
283
 
527
- def get_cluster_metadata(clusters_info, output_file)
284
# Computes, for every cluster, its size and the mean profile length of its patients.
#
# @param clusters_info [Hash{Object=>Hash}] cluster id => {patient id => profile}
# @return [Array<Array>] one [number_of_patients, mean_terms_per_patient] pair
#   per cluster, in iteration order (the mean is NaN for a cluster without patients)
def get_cluster_metadata(clusters_info)
  clusters_info.map do |_cluster_id, pat_info|
    profile_sizes = pat_info.values.map(&:length)
    [pat_info.length, profile_sizes.sum.fdiv(profile_sizes.length)]
  end
end