pets 0.2.3 → 0.2.4

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,130 +1,20 @@
1
- require 'numo/narray'
2
- require 'semtools'
3
-
4
- HPOS = 0
5
- CHR = 1
6
- START = 2
7
- STOP = 3
8
-
9
- def load_hpo_ontology(hpo_file, excluded_hpo_file)
10
- hpo = nil
11
- if !hpo_file.include?('.json')
12
- if !excluded_hpo_file.nil?
13
- hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
14
- else
15
- hpo = Ontology.new(file: hpo_file, load_file: true)
16
- end
17
- else
18
- hpo = Ontology.new
19
- hpo.read(hpo_file)
20
- if !excluded_hpo_file.nil?
21
- hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
22
- hpo.remove_removable()
23
- hpo.build_index()
24
- end
25
- end
26
- return hpo
27
- end
28
-
29
- def format_patient_data(patient_data, options, hpo)
30
- rejected_hpos = []
31
- rejected_patients = []
32
- patient_data.each do |pat_id, patient_record|
33
- hpos, chr, start, stop = patient_record
34
-
35
- if options[:hpo_names]
36
- hpos, pat_rejected_hpos = hpo.translate_names(hpos)
37
- if !pat_rejected_hpos.empty?
38
- STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAMES '#{pat_rejected_hpos.join(',')}'. Rejected."
39
- rejected_hpos.concat(pat_rejected_hpos)
40
- end
41
- end
42
-
43
- hpos, pat_rejected_hpos = hpo.check_ids(hpos.map{|a| a.to_sym})
44
- if !pat_rejected_hpos.empty?
45
- STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODES '#{pat_rejected_hpos.join(',')}'. Rejected."
46
- rejected_hpos.concat(pat_rejected_hpos)
47
- end
48
- if hpos.empty?
49
- rejected_patients << pat_id
50
- else
51
- patient_record[HPOS] = hpos
52
- end
53
- end
54
- return rejected_hpos.uniq, rejected_patients
55
- end
56
-
57
- def compute_hpo_list_and_childs(patient_data, hpo)
58
- all_hpo = []
59
- suggested_childs = {}
60
- total_terms = 0
61
- terms_with_more_specific_childs = 0
62
- patient_data.each do |pat_id, hpos|
63
- total_terms += hpos.length
64
- more_specific_childs = hpo.get_childs_table(hpos, true)
65
- terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
66
- suggested_childs[pat_id] = more_specific_childs
67
- all_hpo.concat(hpos)
68
- end
69
- return all_hpo.uniq, suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
70
- end
71
-
72
- def clean_patient_profiles(hpo, patient_profiles)
73
- rejected_patients = []
74
- patient_profiles.each do |pat, prof|
75
- phens = hpo.clean_profile_hard(prof)
76
- if phens.empty?
77
- rejected_patients << pat
78
- else
79
- patient_profiles[pat] = phens
80
- end
81
- end
82
- patient_profiles.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)}
83
- hpo.profiles = {}
84
- hpo.load_profiles(patient_profiles)
85
-
86
- end
87
-
88
- def generate_patient_hpo_matrix(patient_data, cohort_hpos)
89
- matrix = []
90
- n = cohort_hpos.length
91
- patient_data.each do |pat_id, pat_hpos|
92
- vector = Array.new(n, 0)
93
- pat_hpos.each do |hpo|
94
- vector[cohort_hpos.index(hpo)] = 1
95
- end
96
- matrix << vector
97
- end
98
- return matrix
99
- end
100
-
101
- def generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
102
- y_names = patient_data.keys
103
- x_names = cohort_hpos
104
- x_names_indx = {}
105
- cohort_hpos.each_with_index{|hp,x| x_names_indx[hp]=x}
106
- # row (y), cols (x)
107
- matrix = Numo::DFloat.zeros(patient_data.length, cohort_hpos.length)
108
- i = 0
109
- patient_data.each do |pat_id, pat_hpos|
110
- pat_hpos.each do |hp|
111
- matrix[i, x_names_indx[hp]] = 1
112
- end
113
- i += 1
114
- end
115
- return matrix, y_names, x_names
116
- end
117
-
118
- def write_matrix_for_R(matrix, x_names, y_names, file)
119
- File.open(file, 'w') do |f|
120
- f.puts x_names.join("\t")
121
- matrix.each_with_index do |row, i|
122
- f.puts [y_names[i]].concat(row).join("\t")
123
- end
124
- end
125
- end
126
-
127
- def process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, patient_id_type) # get ic and chromosomes
1
+ require 'expcalc'
2
+ def translate_codes(clusters, hpo)
3
+ translated_clusters = []
4
+ clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
5
+ translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}}
6
+ translated_clusters << [clusterID,
7
+ num_of_pats,
8
+ patientIDs_ary,
9
+ patient_hpos_ary,
10
+ translate_codes
11
+ ]
12
+ end
13
+ return translated_clusters
14
+ end
15
+
16
+ def process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) # get ic and chromosomes
17
+ ont = Cohort.get_ontology(Cohort.act_ont)
128
18
  all_ics = []
129
19
  all_lengths = []
130
20
  top_cluster_phenotypes = []
@@ -132,30 +22,9 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
132
22
  multi_chromosome_patients = 0
133
23
  processed_clusters = 0
134
24
  clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
135
- next if patient_ids.length == 1
136
- chrs = Hash.new(0)
137
- all_phens = []
138
- profile_ics = []
139
- profile_lengths = []
140
- processed_patients = []
141
- patient_ids.each do |pat_id|
142
- phenotypes = patient_uniq_profiles[pat_id]
143
- #pat_id = pat_id.gsub(/_i\d+$/,'') if patient_id_type != 'generated'
144
- processed_patients << pat_id
145
- profile_ics << get_profile_ic(phenotypes, phenotype_ic)
146
- profile_lengths << phenotypes.length
147
- if processed_clusters < options[:clusters2show_detailed_phen_data]
148
- phen_names, rejected_codes = hpo.translate_ids(phenotypes) #optional
149
- all_phens << phen_names
150
- end
151
- variants = equivalence[pat_id]
152
- variants.each do |variant|
153
- variant_data = patient_data[variant]
154
- chrs[variant_data[CHR]] += 1 if !options[:chromosome_col].nil? && variant_data[CHR] != '-'
155
- end
156
- end
157
- num_of_patients = processed_patients.length
158
- next if num_of_patients == 1 # Check that current cluster only has one patient with several mutations
25
+ num_of_patients = patient_ids.length
26
+ next if num_of_patients == 1
27
+ chrs, all_phens, profile_ics, profile_lengths = process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
159
28
  top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data]
160
29
  all_ics << profile_ics
161
30
  all_lengths << profile_lengths
@@ -170,12 +39,29 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
170
39
  return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
171
40
  end
172
41
 
42
+ def process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
43
+ chrs = Hash.new(0)
44
+ all_phens = []
45
+ profile_ics = []
46
+ profile_lengths = []
47
+ patient_ids.each do |pat_id|
48
+ phenotypes = patient_data.get_profile(pat_id)
49
+ profile_ics << get_profile_ic(phenotypes, phenotype_ic)
50
+ profile_lengths << phenotypes.length
51
+ if processed_clusters < options[:clusters2show_detailed_phen_data]
52
+ phen_names, rejected_codes = ont.translate_ids(phenotypes) #optional
53
+ all_phens << phen_names
54
+ end
55
+ patient_data.get_vars(pat_id).get_chr.each{|chr| chrs[chr] += 1} if !options[:chromosome_col].nil?
56
+ end
57
+ return chrs, all_phens, profile_ics, profile_lengths
58
+ end
59
+
173
60
  def get_profile_ic(hpo_names, phenotype_ic)
174
61
  ic = 0
175
62
  profile_length = 0
176
63
  hpo_names.each do |hpo_id|
177
64
  hpo_ic = phenotype_ic[hpo_id]
178
- # STDERR.puts phenotype_ic.inspect
179
65
  ic += hpo_ic if !hpo_ic.nil?
180
66
  profile_length += 1
181
67
  end
@@ -183,154 +69,42 @@ def get_profile_ic(hpo_names, phenotype_ic)
183
69
  return ic.fdiv(profile_length)
184
70
  end
185
71
 
186
- def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
187
- File.open(cluster_ic_data_file, 'w') do |f|
188
- f.puts %w[cluster_id ic Plen].join("\t")
189
- all_ics.each_with_index do |cluster_ics, i|
190
- break if i == limit
191
- cluster_length = cluster_ics.length
192
- cluster_ics.each_with_index do |clust_ic, j|
193
- f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
194
- end
195
- end
196
- end
197
- end
198
-
199
- def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
200
- File.open(cluster_chromosome_data_file, 'w') do |f|
201
- f.puts %w[cluster_id chr count].join("\t")
202
- index = 0
203
- last_id = cluster_data.first.first unless cluster_data.empty?
204
- cluster_data.each do |cluster_id, patient_number, chr, count|
205
- index += 1 if cluster_id != last_id
206
- break if index == limit
207
- f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
208
- last_id = cluster_id
209
- end
210
- end
211
- end
212
-
213
- def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
214
- File.open(coverage_to_plot_file, 'w') do |f|
215
- coverage_to_plot.each do |chr, position, freq|
216
- f.puts "#{chr}\t#{position}\t#{freq}"
217
- end
218
- end
219
- end
220
-
221
- def get_uniq_hpo_profiles(patient_data) # To avoid duplications due to more one mutation in the same patient
222
- hpo_profiles = {}
223
- equivalence = {}
224
- patient_data.each do |variant_id, patient_rec|
225
- pat_id, count = variant_id.split('_i')
226
- hpo_profiles[pat_id] = patient_rec[HPOS]
227
- query = equivalence[pat_id]
228
- if query.nil?
229
- equivalence[pat_id] = [variant_id]
230
- else
231
- query << variant_id
232
- end
233
- end
234
- return hpo_profiles, equivalence
235
- end
236
-
237
- def get_patient_ids(patient_data) # To aviod duplications due to more one mutation in the same patient
238
- ids = []
239
- patient_data.each do |pat_id, hpos|
240
- id, count = pat_id.split('_i')
241
- ids << id
242
- end
243
- return ids.uniq
244
- end
245
-
246
- def get_summary_stats(patient_data, rejected_patients, cohort_hpos, hpo)
72
+ def get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
247
73
  stats = []
248
- stats << ['Unique HPO terms', cohort_hpos.length]
249
- stats << ['Cohort size', get_patient_ids(patient_data).length]
74
+ stats << ['Unique HPO terms', hpo_stats.length]
75
+ stats << ['Cohort size', patient_data.profiles.length]
250
76
  stats << ['Rejected patients by empty profile', rejected_patients.length]
251
- # stats << ['HPOs per patient (average)', hpo.get_profile_mean_length]
252
- stats << ['HPOs per patient (average)', hpo.get_profiles_mean_size]
253
- stats << ['HPO terms per patient: percentile 90', hpo.get_profile_length_at_percentile(perc=90)]
77
+ stats << ['HPOs per patient (average)', patient_data.get_profiles_mean_size]
78
+ stats << ['HPO terms per patient: percentile 90', patient_data.get_profile_length_at_percentile(perc=90)]
79
+ stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)]
80
+ stats << ['DsI for uniq HP terms', patient_data.get_dataset_specifity_index('uniq')]
81
+ stats << ['DsI for frequency weigthed HP terms', patient_data.get_dataset_specifity_index('weigthed')]
82
+ stats << ['Number of unknown phenotypes', rejected_hpos.length]
254
83
  return stats
255
84
  end
256
85
 
257
- def cluster_patients(patient_data, cohort_hpos, matrix_file, clustered_patients_file)
86
+ def dummy_cluster_patients(patient_data, matrix_file, clust_pat_file)
258
87
  if !File.exists?(matrix_file)
259
- pat_hpo_matrix, pat_id, hp_id = generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
88
+ pat_hpo_matrix, pat_id, hp_id = patient_data.to_bmatrix
260
89
  x_axis_file = matrix_file.gsub('.npy','_x.lst')
261
- File.open(x_axis_file, 'w'){|f| f.print hp_id.join("\n") }
262
90
  y_axis_file = matrix_file.gsub('.npy','_y.lst')
263
- File.open(y_axis_file, 'w'){|f| f.print pat_id.join("\n") }
264
- Npy.save(matrix_file, pat_hpo_matrix)
91
+ pat_hpo_matrix.save(matrix_file, hp_id, x_axis_file, pat_id, y_axis_file)
265
92
  end
266
- system("#{File.join(EXTERNAL_CODE, 'get_clusters.R')} -d #{matrix_file} -o #{clustered_patients_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clustered_patients_file)
267
- clustered_patients = load_clustered_patients(clustered_patients_file)
93
+ system_call(EXTERNAL_CODE, 'get_clusters.R', "-d #{matrix_file} -o #{clust_pat_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clust_pat_file)
94
+ clustered_patients = load_clustered_patients(clust_pat_file)
268
95
  return(clustered_patients)
269
96
  end
270
97
 
271
- def get_profile_ontology_distribution_tables(hpo)
272
- ontology_levels, distribution_percentage = hpo.get_profile_ontology_distribution_tables
273
- ontology_levels.unshift(["level", "ontology", "cohort"])
274
- distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
275
- return ontology_levels, distribution_percentage
276
- end
277
-
278
-
279
- def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
280
- CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
281
- suggested_childs.each do |pat_id, suggestions|
282
- warning = nil
283
- warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
284
- csv << ["PATIENT #{pat_id}", "#{warning}"]
285
- csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
286
- suggestions.each do |parent, childs|
287
- parent_code, parent_name = parent
288
- if childs.empty?
289
- csv << ["#{parent_name} (#{parent_code})", '-']
290
- else
291
- parent_writed = false
292
- childs.each do |child_code, child_name|
293
- if !parent_writed
294
- parent_field = "#{parent_name} (#{parent_code})"
295
- parent_writed = true
296
- else
297
- parent_field = ""
298
- end
299
- csv << [parent_field, "#{child_name} (#{child_code})"]
300
- end
301
- end
302
- end
303
- csv << ["", ""]
98
+ def get_mean_size(all_sizes)
99
+ accumulated_size = 0
100
+ number = 0
101
+ all_sizes.each do |size, occurrences|
102
+ accumulated_size += size *occurrences
103
+ number += occurrences
304
104
  end
305
- end
105
+ return accumulated_size.fdiv(number)
306
106
  end
307
107
 
308
- def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
309
- File.open(filename, 'w') do |f|
310
- f.puts "#{x_axis_name}\t#{y_axis_name}"
311
- x_axis_value.each_with_index do |value,i|
312
- y_value = y_axis_value[i]
313
- raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
314
- f.puts [value, y_value].join("\t")
315
- end
316
- end
317
- end
318
-
319
- def process_patient_data(patient_data)
320
- parsed_patient_data = {}
321
- patient_data.each do |patientID, metadata|
322
- phenotypes, chr, start, stop = metadata
323
- next if chr == '-'
324
- info = [patientID, start.to_i, stop.to_i]
325
- query = parsed_patient_data[chr]
326
- if query.nil?
327
- parsed_patient_data[chr] = [info]
328
- else
329
- query << info
330
- end
331
- end
332
- return parsed_patient_data
333
- end
334
108
 
335
109
  def get_final_coverage(raw_coverage, bin_size)
336
110
  coverage_to_plot = []
@@ -360,185 +134,113 @@ def get_sor_length_distribution(raw_coverage)
360
134
  return all_cnvs_length
361
135
  end
362
136
 
363
- def get_cnvs_length(patient_data)
364
- length_stats = Hash.new(0)
365
- patient_data.each do |pat_id, patient_record|
366
- string_hpos, chr, start, stop = patient_record
367
- length_stats[stop - start] += 1
368
- end
369
- return length_stats.to_a.sort!{|stat| stat[1] <=> stat[1] }
370
- end
371
-
372
-
373
137
  def calculate_coverage(regions_data, delete_thresold = 0)
374
138
  raw_coverage = {}
375
139
  n_regions = 0
376
140
  patients = 0
377
141
  nt = 0
378
- regions_data.each do |start, stop, chr, node|
379
- number_of_patients = node.split('.').last.to_i
142
+ regions_data.each do |start, stop, chr, reg_id|
143
+ number_of_patients = reg_id.split('.').last.to_i
380
144
  if number_of_patients <= delete_thresold
381
145
  number_of_patients = 0
382
146
  else
383
147
  n_regions += 1
384
- patients += number_of_patients
385
148
  nt += stop - start
386
149
  end
387
- coords = [start, stop, number_of_patients]
388
- query = raw_coverage[chr]
389
- if query.nil?
390
- raw_coverage[chr] = [coords]
391
- else
392
- query << coords
393
- end
150
+ add_record(raw_coverage, chr, [start, stop, number_of_patients])
151
+ patients += number_of_patients
394
152
  end
395
153
  return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
396
154
  end
397
155
 
398
- def get_profile_redundancy(hpo)
399
- #TODO: sort both arrays consequently
400
- #TODO: bear in mind join both arrays with zip and sort by one, to get an [a[a]]
401
- # profile_sizes = hpo.get_profile_sizes
402
- profile_sizes = hpo.get_profiles_sizes
403
- # parental_hpos_per_profile = hpo.compute_redundant_parental_terms_per_profile
404
- parental_hpos_per_profile = hpo.parentals_per_profile# clean_profiles
405
- parental_hpos_per_profile = parental_hpos_per_profile.map{|item| item[0]}
406
- profile_sizes, parental_hpos_per_profile = profile_sizes.zip(parental_hpos_per_profile).sort_by{|i| i.first}.reverse.transpose
407
- return profile_sizes, parental_hpos_per_profile
408
- end
409
-
410
- def format_profiles_similarity_data(profiles_similarity)
411
- matrix = []
412
- element_names = profiles_similarity.keys
413
- matrix << element_names
414
- profiles_similarity.each do |elementA, relations|
415
- row = [elementA]
416
- element_names.each do |elementB|
417
- if elementA == elementB
418
- row << 'NA'
419
- else
420
- query = relations[elementB]
421
- if !query.nil?
422
- row << query
423
- else
424
- row << profiles_similarity[elementB][elementA]
425
- end
426
- end
427
- end
428
- matrix << row
429
- end
430
- matrix[0].unshift('pat')
431
- return matrix
432
- end
433
-
434
- def format_profiles_similarity_data_pairs(profiles_similarity)
435
- pairs = []
436
- element_names = profiles_similarity.keys
437
- profiles_similarity.each do |elementA, relations|
438
- element_names.each do |elementB|
439
- if elementA != elementB
440
- pair = [elementA, elementB]
441
- query = relations[elementB]
442
- if !query.nil?
443
- pair << query
444
- else
445
- pair << profiles_similarity[elementB][elementA]
446
- end
447
- pairs << pair
156
+ def get_top_dummy_clusters_stats(top_clust_phen)
157
+ new_cluster_phenotypes = {}
158
+ top_clust_phen.each_with_index do |cluster, clusterID|
159
+ phenotypes_frequency = Hash.new(0)
160
+ total_patients = cluster.length
161
+ cluster.each do |phenotypes|
162
+ phenotypes.each do |p|
163
+ phenotypes_frequency[p] += 1
448
164
  end
449
165
  end
450
- end
451
- return pairs
452
- end
453
-
454
- def format_profiles_similarity_data_numo(profiles_similarity)
455
- element_names = profiles_similarity.keys
456
- matrix = Numo::DFloat.zeros(element_names.length, element_names.length)
457
- i = 0
458
- profiles_similarity.each do |elementA, relations|
459
- element_names.each_with_index do |elementB, j|
460
- if elementA != elementB
461
- query = relations[elementB]
462
- if !query.nil?
463
- matrix[i, j] = query
464
- else
465
- matrix[i, j] = profiles_similarity[elementB][elementA]
466
- end
467
- end
468
- end
469
- i += 1
470
- end
471
- return matrix, element_names
472
- end
473
-
474
- def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
475
- File.open(similarity_matrix_file, 'w') do |f|
476
- similarity_matrix.each do |row|
477
- f.puts row.join("\t")
478
- end
479
- end
480
- end
481
-
482
- def write_profile_pairs(similarity_pairs, filename)
483
- File.open(filename, 'w') do |f|
484
- similarity_pairs.each do |pairsA, pairsB_and_values|
485
- pairsB_and_values.each do |pairsB, values|
486
- f.puts "#{pairsA}\t#{pairsB}\t#{values}"
166
+ new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
167
+ end
168
+ return new_cluster_phenotypes
169
+ end
170
+
171
+ def remove_nested_entries(nested_hash)
172
+ empty_root_ids = []
173
+ nested_hash.each do |root_id, entries|
174
+ entries.select!{|id, val| yield(id, val)}
175
+ empty_root_ids << root_id if entries.empty?
176
+ end
177
+ empty_root_ids.each{|id| nested_hash.delete(id)}
178
+ end
179
+
180
+ def get_semantic_similarity_clustering(options, patient_data, temp_folder)
181
+ template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read
182
+ hpo = Cohort.get_ontology(Cohort.act_ont)
183
+ reference_profiles = nil
184
+ reference_profiles = load_profiles(options[:reference_profiles], hpo) if !options[:reference_profiles].nil?
185
+ Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name|
186
+ matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
187
+ profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
188
+ clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
189
+ if !File.exists?(matrix_filename)
190
+ profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
191
+ remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
192
+ write_profile_pairs(profiles_similarity, profiles_similarity_filename)
193
+ if reference_profiles.nil?
194
+ axis_file = matrix_filename.gsub('.npy','.lst')
195
+ similarity_matrix, axis_names = profiles_similarity.to_wmatrix(squared: true, symm: true)
196
+ similarity_matrix.save(matrix_filename, axis_names, axis_file)
197
+ else
198
+ axis_file_x = matrix_filename.gsub('.npy','_x.lst')
199
+ axis_file_y = matrix_filename.gsub('.npy','_y.lst')
200
+ similarity_matrix, y_names, x_names = profiles_similarity.to_wmatrix(squared: false, symm: true)
201
+ similarity_matrix.save(matrix_filename, y_names, axis_file_y, x_names, axis_file_x)
487
202
  end
488
203
  end
489
- end
490
- end
491
-
492
- def parse_clusters_file(clusters_file, patient_profiles)
493
- clusters_info = {}
494
- clusters_table = []
495
- File.open(clusters_file).each do |line|
496
- line.chomp!
497
- patientID, clusterID = line.split("\t")
498
- patientHPOProfile = patient_profiles[patientID]
499
- query = clusters_info[clusterID]
500
- if query.nil?
501
- clusters_info[clusterID] = {patientID => patientHPOProfile}
502
- else
503
- query[patientID] = patientHPOProfile
504
- end
505
- end
506
- clusters_info.each do |clusterID, patients_info|
507
- patients_per_cluster = patients_info.keys.length
508
- clusters_table << [clusterID, patients_per_cluster, patients_info.keys, patients_info.values]
509
- end
510
- return clusters_table, clusters_info
511
- end
512
-
513
- def get_patient_hpo_frequency(patient_uniq_profiles, hpo_frequency_file)
514
- hpo_frequency = Hash.new(0)
515
- patient_uniq_profiles.values.each do |hpos|
516
- hpos.each do |hpo|
517
- hpo_frequency[hpo] += 1
518
- end
519
- end
520
- File.open(hpo_frequency_file, 'w') do |f|
521
- hpo_frequency.each do |hpo_code, freq|
522
- f.puts "#{hpo_code.to_s}\t#{freq}"
523
- end
524
- end
525
- end
526
-
527
- def get_cluster_metadata(clusters_info, output_file)
204
+ ext_var = ''
205
+ if method_name == 'resnik'
206
+ ext_var = '-m max'
207
+ elsif method_name == 'lin'
208
+ ext_var = '-m comp1'
209
+ end
210
+ cluster_file = "#{method_name}_clusters.txt"
211
+ if !reference_profiles.nil?
212
+ ext_var << ' -s'
213
+ axis_file = "#{axis_file_y},#{axis_file_x}"
214
+ cluster_file = "#{method_name}_clusters_rows.txt"
215
+ end
216
+ out_file = File.join(temp_folder, method_name)
217
+ system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file + '_heatmap.png')
218
+ clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, cluster_file), patient_data)
219
+ write_patient_hpo_stat(get_cluster_metadata(clusters_info), clusters_distribution_filename)
220
+ out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
221
+ system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
222
+ clusters = translate_codes(clusters_codes, hpo)
223
+
224
+ container = {
225
+ :temp_folder => temp_folder,
226
+ :cluster_name => method_name,
227
+ :clusters => clusters,
228
+ :hpo => hpo
229
+ }
230
+
231
+ report = Report_html.new(container, 'Patient clusters report')
232
+ report.build(template)
233
+ report.write(options[:output_file]+"_#{method_name}_clusters.html")
234
+ system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -m #{method_name} -o #{File.join(temp_folder, method_name + '_sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png'))
235
+ end
236
+ end
237
+
238
+ def get_cluster_metadata(clusters_info)
528
239
  average_hp_per_pat_distribution = []
529
- tmp = []
530
240
  clusters_info.each do |cl_id, pat_info|
531
241
  hp_per_pat_in_clust = pat_info.values.map{|a| a.length}
532
242
  hp_per_pat_ave = hp_per_pat_in_clust.sum.fdiv(hp_per_pat_in_clust.length)
533
243
  average_hp_per_pat_distribution << [pat_info.length, hp_per_pat_ave]
534
- tmp << hp_per_pat_in_clust
535
- end
536
- total_clusters = clusters_info.length
537
- average_phenotypes_by_cluster = tmp.flatten.sum.fdiv(total_clusters)
538
- File.open(output_file, 'w') do |f|
539
- f.puts "#{'PatientsNumber'}\t#{'HPOAverage'}"
540
- average_hp_per_pat_distribution.each do |patient_num, ave|
541
- f.puts "#{patient_num}\t#{ave}"
542
- end
543
244
  end
245
+ return average_hp_per_pat_distribution
544
246
  end