pets 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,130 +1,20 @@
1
- require 'numo/narray'
2
- require 'semtools'
3
-
4
- HPOS = 0
5
- CHR = 1
6
- START = 2
7
- STOP = 3
8
-
9
- def load_hpo_ontology(hpo_file, excluded_hpo_file)
10
- hpo = nil
11
- if !hpo_file.include?('.json')
12
- if !excluded_hpo_file.nil?
13
- hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
14
- else
15
- hpo = Ontology.new(file: hpo_file, load_file: true)
16
- end
17
- else
18
- hpo = Ontology.new
19
- hpo.read(hpo_file)
20
- if !excluded_hpo_file.nil?
21
- hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
22
- hpo.remove_removable()
23
- hpo.build_index()
24
- end
25
- end
26
- return hpo
27
- end
28
-
29
- def format_patient_data(patient_data, options, hpo)
30
- rejected_hpos = []
31
- rejected_patients = []
32
- patient_data.each do |pat_id, patient_record|
33
- hpos, chr, start, stop = patient_record
34
-
35
- if options[:hpo_names]
36
- hpos, pat_rejected_hpos = hpo.translate_names(hpos)
37
- if !pat_rejected_hpos.empty?
38
- STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAMES '#{pat_rejected_hpos.join(',')}'. Rejected."
39
- rejected_hpos.concat(pat_rejected_hpos)
40
- end
41
- end
42
-
43
- hpos, pat_rejected_hpos = hpo.check_ids(hpos.map{|a| a.to_sym})
44
- if !pat_rejected_hpos.empty?
45
- STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODES '#{pat_rejected_hpos.join(',')}'. Rejected."
46
- rejected_hpos.concat(pat_rejected_hpos)
47
- end
48
- if hpos.empty?
49
- rejected_patients << pat_id
50
- else
51
- patient_record[HPOS] = hpos
52
- end
53
- end
54
- return rejected_hpos.uniq, rejected_patients
55
- end
56
-
57
- def compute_hpo_list_and_childs(patient_data, hpo)
58
- all_hpo = []
59
- suggested_childs = {}
60
- total_terms = 0
61
- terms_with_more_specific_childs = 0
62
- patient_data.each do |pat_id, hpos|
63
- total_terms += hpos.length
64
- more_specific_childs = hpo.get_childs_table(hpos, true)
65
- terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
66
- suggested_childs[pat_id] = more_specific_childs
67
- all_hpo.concat(hpos)
68
- end
69
- return all_hpo.uniq, suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
70
- end
71
-
72
- def clean_patient_profiles(hpo, patient_profiles)
73
- rejected_patients = []
74
- patient_profiles.each do |pat, prof|
75
- phens = hpo.clean_profile_hard(prof)
76
- if phens.empty?
77
- rejected_patients << pat
78
- else
79
- patient_profiles[pat] = phens
80
- end
81
- end
82
- patient_profiles.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)}
83
- hpo.profiles = {}
84
- hpo.load_profiles(patient_profiles)
85
-
86
- end
87
-
88
- def generate_patient_hpo_matrix(patient_data, cohort_hpos)
89
- matrix = []
90
- n = cohort_hpos.length
91
- patient_data.each do |pat_id, pat_hpos|
92
- vector = Array.new(n, 0)
93
- pat_hpos.each do |hpo|
94
- vector[cohort_hpos.index(hpo)] = 1
95
- end
96
- matrix << vector
97
- end
98
- return matrix
99
- end
100
-
101
- def generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
102
- y_names = patient_data.keys
103
- x_names = cohort_hpos
104
- x_names_indx = {}
105
- cohort_hpos.each_with_index{|hp,x| x_names_indx[hp]=x}
106
- # row (y), cols (x)
107
- matrix = Numo::DFloat.zeros(patient_data.length, cohort_hpos.length)
108
- i = 0
109
- patient_data.each do |pat_id, pat_hpos|
110
- pat_hpos.each do |hp|
111
- matrix[i, x_names_indx[hp]] = 1
112
- end
113
- i += 1
114
- end
115
- return matrix, y_names, x_names
116
- end
117
-
118
- def write_matrix_for_R(matrix, x_names, y_names, file)
119
- File.open(file, 'w') do |f|
120
- f.puts x_names.join("\t")
121
- matrix.each_with_index do |row, i|
122
- f.puts [y_names[i]].concat(row).join("\t")
123
- end
124
- end
125
- end
126
-
127
- def process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, patient_id_type) # get ic and chromosomes
1
+ require 'expcalc'
2
+ def translate_codes(clusters, hpo)
3
+ translated_clusters = []
4
+ clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
5
+ translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}}
6
+ translated_clusters << [clusterID,
7
+ num_of_pats,
8
+ patientIDs_ary,
9
+ patient_hpos_ary,
10
+ translate_codes
11
+ ]
12
+ end
13
+ return translated_clusters
14
+ end
15
+
16
+ def process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) # get ic and chromosomes
17
+ ont = Cohort.get_ontology(Cohort.act_ont)
128
18
  all_ics = []
129
19
  all_lengths = []
130
20
  top_cluster_phenotypes = []
@@ -132,30 +22,9 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
132
22
  multi_chromosome_patients = 0
133
23
  processed_clusters = 0
134
24
  clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
135
- next if patient_ids.length == 1
136
- chrs = Hash.new(0)
137
- all_phens = []
138
- profile_ics = []
139
- profile_lengths = []
140
- processed_patients = []
141
- patient_ids.each do |pat_id|
142
- phenotypes = patient_uniq_profiles[pat_id]
143
- #pat_id = pat_id.gsub(/_i\d+$/,'') if patient_id_type != 'generated'
144
- processed_patients << pat_id
145
- profile_ics << get_profile_ic(phenotypes, phenotype_ic)
146
- profile_lengths << phenotypes.length
147
- if processed_clusters < options[:clusters2show_detailed_phen_data]
148
- phen_names, rejected_codes = hpo.translate_ids(phenotypes) #optional
149
- all_phens << phen_names
150
- end
151
- variants = equivalence[pat_id]
152
- variants.each do |variant|
153
- variant_data = patient_data[variant]
154
- chrs[variant_data[CHR]] += 1 if !options[:chromosome_col].nil? && variant_data[CHR] != '-'
155
- end
156
- end
157
- num_of_patients = processed_patients.length
158
- next if num_of_patients == 1 # Check that current cluster only has one patient with several mutations
25
+ num_of_patients = patient_ids.length
26
+ next if num_of_patients == 1
27
+ chrs, all_phens, profile_ics, profile_lengths = process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
159
28
  top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data]
160
29
  all_ics << profile_ics
161
30
  all_lengths << profile_lengths
@@ -170,12 +39,29 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
170
39
  return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
171
40
  end
172
41
 
42
+ def process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
43
+ chrs = Hash.new(0)
44
+ all_phens = []
45
+ profile_ics = []
46
+ profile_lengths = []
47
+ patient_ids.each do |pat_id|
48
+ phenotypes = patient_data.get_profile(pat_id)
49
+ profile_ics << get_profile_ic(phenotypes, phenotype_ic)
50
+ profile_lengths << phenotypes.length
51
+ if processed_clusters < options[:clusters2show_detailed_phen_data]
52
+ phen_names, rejected_codes = ont.translate_ids(phenotypes) #optional
53
+ all_phens << phen_names
54
+ end
55
+ patient_data.get_vars(pat_id).get_chr.each{|chr| chrs[chr] += 1} if !options[:chromosome_col].nil?
56
+ end
57
+ return chrs, all_phens, profile_ics, profile_lengths
58
+ end
59
+
173
60
  def get_profile_ic(hpo_names, phenotype_ic)
174
61
  ic = 0
175
62
  profile_length = 0
176
63
  hpo_names.each do |hpo_id|
177
64
  hpo_ic = phenotype_ic[hpo_id]
178
- # STDERR.puts phenotype_ic.inspect
179
65
  ic += hpo_ic if !hpo_ic.nil?
180
66
  profile_length += 1
181
67
  end
@@ -183,154 +69,42 @@ def get_profile_ic(hpo_names, phenotype_ic)
183
69
  return ic.fdiv(profile_length)
184
70
  end
185
71
 
186
- def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
187
- File.open(cluster_ic_data_file, 'w') do |f|
188
- f.puts %w[cluster_id ic Plen].join("\t")
189
- all_ics.each_with_index do |cluster_ics, i|
190
- break if i == limit
191
- cluster_length = cluster_ics.length
192
- cluster_ics.each_with_index do |clust_ic, j|
193
- f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
194
- end
195
- end
196
- end
197
- end
198
-
199
- def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
200
- File.open(cluster_chromosome_data_file, 'w') do |f|
201
- f.puts %w[cluster_id chr count].join("\t")
202
- index = 0
203
- last_id = cluster_data.first.first unless cluster_data.empty?
204
- cluster_data.each do |cluster_id, patient_number, chr, count|
205
- index += 1 if cluster_id != last_id
206
- break if index == limit
207
- f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
208
- last_id = cluster_id
209
- end
210
- end
211
- end
212
-
213
- def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
214
- File.open(coverage_to_plot_file, 'w') do |f|
215
- coverage_to_plot.each do |chr, position, freq|
216
- f.puts "#{chr}\t#{position}\t#{freq}"
217
- end
218
- end
219
- end
220
-
221
- def get_uniq_hpo_profiles(patient_data) # To avoid duplications due to more one mutation in the same patient
222
- hpo_profiles = {}
223
- equivalence = {}
224
- patient_data.each do |variant_id, patient_rec|
225
- pat_id, count = variant_id.split('_i')
226
- hpo_profiles[pat_id] = patient_rec[HPOS]
227
- query = equivalence[pat_id]
228
- if query.nil?
229
- equivalence[pat_id] = [variant_id]
230
- else
231
- query << variant_id
232
- end
233
- end
234
- return hpo_profiles, equivalence
235
- end
236
-
237
- def get_patient_ids(patient_data) # To aviod duplications due to more one mutation in the same patient
238
- ids = []
239
- patient_data.each do |pat_id, hpos|
240
- id, count = pat_id.split('_i')
241
- ids << id
242
- end
243
- return ids.uniq
244
- end
245
-
246
- def get_summary_stats(patient_data, rejected_patients, cohort_hpos, hpo)
72
+ def get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
247
73
  stats = []
248
- stats << ['Unique HPO terms', cohort_hpos.length]
249
- stats << ['Cohort size', get_patient_ids(patient_data).length]
74
+ stats << ['Unique HPO terms', hpo_stats.length]
75
+ stats << ['Cohort size', patient_data.profiles.length]
250
76
  stats << ['Rejected patients by empty profile', rejected_patients.length]
251
- # stats << ['HPOs per patient (average)', hpo.get_profile_mean_length]
252
- stats << ['HPOs per patient (average)', hpo.get_profiles_mean_size]
253
- stats << ['HPO terms per patient: percentile 90', hpo.get_profile_length_at_percentile(perc=90)]
77
+ stats << ['HPOs per patient (average)', patient_data.get_profiles_mean_size]
78
+ stats << ['HPO terms per patient: percentile 90', patient_data.get_profile_length_at_percentile(perc=90)]
79
+ stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)]
80
+ stats << ['DsI for uniq HP terms', patient_data.get_dataset_specifity_index('uniq')]
81
+ stats << ['DsI for frequency weigthed HP terms', patient_data.get_dataset_specifity_index('weigthed')]
82
+ stats << ['Number of unknown phenotypes', rejected_hpos.length]
254
83
  return stats
255
84
  end
256
85
 
257
- def cluster_patients(patient_data, cohort_hpos, matrix_file, clustered_patients_file)
86
+ def dummy_cluster_patients(patient_data, matrix_file, clust_pat_file)
258
87
  if !File.exists?(matrix_file)
259
- pat_hpo_matrix, pat_id, hp_id = generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
88
+ pat_hpo_matrix, pat_id, hp_id = patient_data.to_bmatrix
260
89
  x_axis_file = matrix_file.gsub('.npy','_x.lst')
261
- File.open(x_axis_file, 'w'){|f| f.print hp_id.join("\n") }
262
90
  y_axis_file = matrix_file.gsub('.npy','_y.lst')
263
- File.open(y_axis_file, 'w'){|f| f.print pat_id.join("\n") }
264
- Npy.save(matrix_file, pat_hpo_matrix)
91
+ pat_hpo_matrix.save(matrix_file, hp_id, x_axis_file, pat_id, y_axis_file)
265
92
  end
266
- system("#{File.join(EXTERNAL_CODE, 'get_clusters.R')} -d #{matrix_file} -o #{clustered_patients_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clustered_patients_file)
267
- clustered_patients = load_clustered_patients(clustered_patients_file)
93
+ system_call(EXTERNAL_CODE, 'get_clusters.R', "-d #{matrix_file} -o #{clust_pat_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clust_pat_file)
94
+ clustered_patients = load_clustered_patients(clust_pat_file)
268
95
  return(clustered_patients)
269
96
  end
270
97
 
271
- def get_profile_ontology_distribution_tables(hpo)
272
- ontology_levels, distribution_percentage = hpo.get_profile_ontology_distribution_tables
273
- ontology_levels.unshift(["level", "ontology", "cohort"])
274
- distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
275
- return ontology_levels, distribution_percentage
276
- end
277
-
278
-
279
- def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
280
- CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
281
- suggested_childs.each do |pat_id, suggestions|
282
- warning = nil
283
- warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
284
- csv << ["PATIENT #{pat_id}", "#{warning}"]
285
- csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
286
- suggestions.each do |parent, childs|
287
- parent_code, parent_name = parent
288
- if childs.empty?
289
- csv << ["#{parent_name} (#{parent_code})", '-']
290
- else
291
- parent_writed = false
292
- childs.each do |child_code, child_name|
293
- if !parent_writed
294
- parent_field = "#{parent_name} (#{parent_code})"
295
- parent_writed = true
296
- else
297
- parent_field = ""
298
- end
299
- csv << [parent_field, "#{child_name} (#{child_code})"]
300
- end
301
- end
302
- end
303
- csv << ["", ""]
98
+ def get_mean_size(all_sizes)
99
+ accumulated_size = 0
100
+ number = 0
101
+ all_sizes.each do |size, occurrences|
102
+ accumulated_size += size *occurrences
103
+ number += occurrences
304
104
  end
305
- end
105
+ return accumulated_size.fdiv(number)
306
106
  end
307
107
 
308
- def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
309
- File.open(filename, 'w') do |f|
310
- f.puts "#{x_axis_name}\t#{y_axis_name}"
311
- x_axis_value.each_with_index do |value,i|
312
- y_value = y_axis_value[i]
313
- raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
314
- f.puts [value, y_value].join("\t")
315
- end
316
- end
317
- end
318
-
319
- def process_patient_data(patient_data)
320
- parsed_patient_data = {}
321
- patient_data.each do |patientID, metadata|
322
- phenotypes, chr, start, stop = metadata
323
- next if chr == '-'
324
- info = [patientID, start.to_i, stop.to_i]
325
- query = parsed_patient_data[chr]
326
- if query.nil?
327
- parsed_patient_data[chr] = [info]
328
- else
329
- query << info
330
- end
331
- end
332
- return parsed_patient_data
333
- end
334
108
 
335
109
  def get_final_coverage(raw_coverage, bin_size)
336
110
  coverage_to_plot = []
@@ -360,185 +134,113 @@ def get_sor_length_distribution(raw_coverage)
360
134
  return all_cnvs_length
361
135
  end
362
136
 
363
- def get_cnvs_length(patient_data)
364
- length_stats = Hash.new(0)
365
- patient_data.each do |pat_id, patient_record|
366
- string_hpos, chr, start, stop = patient_record
367
- length_stats[stop - start] += 1
368
- end
369
- return length_stats.to_a.sort!{|stat| stat[1] <=> stat[1] }
370
- end
371
-
372
-
373
137
  def calculate_coverage(regions_data, delete_thresold = 0)
374
138
  raw_coverage = {}
375
139
  n_regions = 0
376
140
  patients = 0
377
141
  nt = 0
378
- regions_data.each do |start, stop, chr, node|
379
- number_of_patients = node.split('.').last.to_i
142
+ regions_data.each do |start, stop, chr, reg_id|
143
+ number_of_patients = reg_id.split('.').last.to_i
380
144
  if number_of_patients <= delete_thresold
381
145
  number_of_patients = 0
382
146
  else
383
147
  n_regions += 1
384
- patients += number_of_patients
385
148
  nt += stop - start
386
149
  end
387
- coords = [start, stop, number_of_patients]
388
- query = raw_coverage[chr]
389
- if query.nil?
390
- raw_coverage[chr] = [coords]
391
- else
392
- query << coords
393
- end
150
+ add_record(raw_coverage, chr, [start, stop, number_of_patients])
151
+ patients += number_of_patients
394
152
  end
395
153
  return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
396
154
  end
397
155
 
398
- def get_profile_redundancy(hpo)
399
- #TODO: sort both arrays consequently
400
- #TODO: bear in mind join both arrays with zip and sort by one, to get an [a[a]]
401
- # profile_sizes = hpo.get_profile_sizes
402
- profile_sizes = hpo.get_profiles_sizes
403
- # parental_hpos_per_profile = hpo.compute_redundant_parental_terms_per_profile
404
- parental_hpos_per_profile = hpo.parentals_per_profile# clean_profiles
405
- parental_hpos_per_profile = parental_hpos_per_profile.map{|item| item[0]}
406
- profile_sizes, parental_hpos_per_profile = profile_sizes.zip(parental_hpos_per_profile).sort_by{|i| i.first}.reverse.transpose
407
- return profile_sizes, parental_hpos_per_profile
408
- end
409
-
410
- def format_profiles_similarity_data(profiles_similarity)
411
- matrix = []
412
- element_names = profiles_similarity.keys
413
- matrix << element_names
414
- profiles_similarity.each do |elementA, relations|
415
- row = [elementA]
416
- element_names.each do |elementB|
417
- if elementA == elementB
418
- row << 'NA'
419
- else
420
- query = relations[elementB]
421
- if !query.nil?
422
- row << query
423
- else
424
- row << profiles_similarity[elementB][elementA]
425
- end
426
- end
427
- end
428
- matrix << row
429
- end
430
- matrix[0].unshift('pat')
431
- return matrix
432
- end
433
-
434
- def format_profiles_similarity_data_pairs(profiles_similarity)
435
- pairs = []
436
- element_names = profiles_similarity.keys
437
- profiles_similarity.each do |elementA, relations|
438
- element_names.each do |elementB|
439
- if elementA != elementB
440
- pair = [elementA, elementB]
441
- query = relations[elementB]
442
- if !query.nil?
443
- pair << query
444
- else
445
- pair << profiles_similarity[elementB][elementA]
446
- end
447
- pairs << pair
156
+ def get_top_dummy_clusters_stats(top_clust_phen)
157
+ new_cluster_phenotypes = {}
158
+ top_clust_phen.each_with_index do |cluster, clusterID|
159
+ phenotypes_frequency = Hash.new(0)
160
+ total_patients = cluster.length
161
+ cluster.each do |phenotypes|
162
+ phenotypes.each do |p|
163
+ phenotypes_frequency[p] += 1
448
164
  end
449
165
  end
450
- end
451
- return pairs
452
- end
453
-
454
- def format_profiles_similarity_data_numo(profiles_similarity)
455
- element_names = profiles_similarity.keys
456
- matrix = Numo::DFloat.zeros(element_names.length, element_names.length)
457
- i = 0
458
- profiles_similarity.each do |elementA, relations|
459
- element_names.each_with_index do |elementB, j|
460
- if elementA != elementB
461
- query = relations[elementB]
462
- if !query.nil?
463
- matrix[i, j] = query
464
- else
465
- matrix[i, j] = profiles_similarity[elementB][elementA]
466
- end
467
- end
468
- end
469
- i += 1
470
- end
471
- return matrix, element_names
472
- end
473
-
474
- def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
475
- File.open(similarity_matrix_file, 'w') do |f|
476
- similarity_matrix.each do |row|
477
- f.puts row.join("\t")
478
- end
479
- end
480
- end
481
-
482
- def write_profile_pairs(similarity_pairs, filename)
483
- File.open(filename, 'w') do |f|
484
- similarity_pairs.each do |pairsA, pairsB_and_values|
485
- pairsB_and_values.each do |pairsB, values|
486
- f.puts "#{pairsA}\t#{pairsB}\t#{values}"
166
+ new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
167
+ end
168
+ return new_cluster_phenotypes
169
+ end
170
+
171
+ def remove_nested_entries(nested_hash)
172
+ empty_root_ids = []
173
+ nested_hash.each do |root_id, entries|
174
+ entries.select!{|id, val| yield(id, val)}
175
+ empty_root_ids << root_id if entries.empty?
176
+ end
177
+ empty_root_ids.each{|id| nested_hash.delete(id)}
178
+ end
179
+
180
+ def get_semantic_similarity_clustering(options, patient_data, temp_folder)
181
+ template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read
182
+ hpo = Cohort.get_ontology(Cohort.act_ont)
183
+ reference_profiles = nil
184
+ reference_profiles = load_profiles(options[:reference_profiles], hpo) if !options[:reference_profiles].nil?
185
+ Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name|
186
+ matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
187
+ profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
188
+ clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
189
+ if !File.exists?(matrix_filename)
190
+ profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
191
+ remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
192
+ write_profile_pairs(profiles_similarity, profiles_similarity_filename)
193
+ if reference_profiles.nil?
194
+ axis_file = matrix_filename.gsub('.npy','.lst')
195
+ similarity_matrix, axis_names = profiles_similarity.to_wmatrix(squared: true, symm: true)
196
+ similarity_matrix.save(matrix_filename, axis_names, axis_file)
197
+ else
198
+ axis_file_x = matrix_filename.gsub('.npy','_x.lst')
199
+ axis_file_y = matrix_filename.gsub('.npy','_y.lst')
200
+ similarity_matrix, y_names, x_names = profiles_similarity.to_wmatrix(squared: false, symm: true)
201
+ similarity_matrix.save(matrix_filename, y_names, axis_file_y, x_names, axis_file_x)
487
202
  end
488
203
  end
489
- end
490
- end
491
-
492
- def parse_clusters_file(clusters_file, patient_profiles)
493
- clusters_info = {}
494
- clusters_table = []
495
- File.open(clusters_file).each do |line|
496
- line.chomp!
497
- patientID, clusterID = line.split("\t")
498
- patientHPOProfile = patient_profiles[patientID]
499
- query = clusters_info[clusterID]
500
- if query.nil?
501
- clusters_info[clusterID] = {patientID => patientHPOProfile}
502
- else
503
- query[patientID] = patientHPOProfile
504
- end
505
- end
506
- clusters_info.each do |clusterID, patients_info|
507
- patients_per_cluster = patients_info.keys.length
508
- clusters_table << [clusterID, patients_per_cluster, patients_info.keys, patients_info.values]
509
- end
510
- return clusters_table, clusters_info
511
- end
512
-
513
- def get_patient_hpo_frequency(patient_uniq_profiles, hpo_frequency_file)
514
- hpo_frequency = Hash.new(0)
515
- patient_uniq_profiles.values.each do |hpos|
516
- hpos.each do |hpo|
517
- hpo_frequency[hpo] += 1
518
- end
519
- end
520
- File.open(hpo_frequency_file, 'w') do |f|
521
- hpo_frequency.each do |hpo_code, freq|
522
- f.puts "#{hpo_code.to_s}\t#{freq}"
523
- end
524
- end
525
- end
526
-
527
- def get_cluster_metadata(clusters_info, output_file)
204
+ ext_var = ''
205
+ if method_name == 'resnik'
206
+ ext_var = '-m max'
207
+ elsif method_name == 'lin'
208
+ ext_var = '-m comp1'
209
+ end
210
+ cluster_file = "#{method_name}_clusters.txt"
211
+ if !reference_profiles.nil?
212
+ ext_var << ' -s'
213
+ axis_file = "#{axis_file_y},#{axis_file_x}"
214
+ cluster_file = "#{method_name}_clusters_rows.txt"
215
+ end
216
+ out_file = File.join(temp_folder, method_name)
217
+ system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file + '_heatmap.png')
218
+ clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, cluster_file), patient_data)
219
+ write_patient_hpo_stat(get_cluster_metadata(clusters_info), clusters_distribution_filename)
220
+ out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
221
+ system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
222
+ clusters = translate_codes(clusters_codes, hpo)
223
+
224
+ container = {
225
+ :temp_folder => temp_folder,
226
+ :cluster_name => method_name,
227
+ :clusters => clusters,
228
+ :hpo => hpo
229
+ }
230
+
231
+ report = Report_html.new(container, 'Patient clusters report')
232
+ report.build(template)
233
+ report.write(options[:output_file]+"_#{method_name}_clusters.html")
234
+ system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -m #{method_name} -o #{File.join(temp_folder, method_name + '_sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png'))
235
+ end
236
+ end
237
+
238
+ def get_cluster_metadata(clusters_info)
528
239
  average_hp_per_pat_distribution = []
529
- tmp = []
530
240
  clusters_info.each do |cl_id, pat_info|
531
241
  hp_per_pat_in_clust = pat_info.values.map{|a| a.length}
532
242
  hp_per_pat_ave = hp_per_pat_in_clust.sum.fdiv(hp_per_pat_in_clust.length)
533
243
  average_hp_per_pat_distribution << [pat_info.length, hp_per_pat_ave]
534
- tmp << hp_per_pat_in_clust
535
- end
536
- total_clusters = clusters_info.length
537
- average_phenotypes_by_cluster = tmp.flatten.sum.fdiv(total_clusters)
538
- File.open(output_file, 'w') do |f|
539
- f.puts "#{'PatientsNumber'}\t#{'HPOAverage'}"
540
- average_hp_per_pat_distribution.each do |patient_num, ave|
541
- f.puts "#{patient_num}\t#{ave}"
542
- end
543
244
  end
245
+ return average_hp_per_pat_distribution
544
246
  end