RubyGems - pets - Versions diffs - 0.2.3 → 0.2.4 - Mend

pets 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

checksums.yaml +4 -4
data/Gemfile +2 -0
data/README.md +79 -5
data/bin/coPatReporter.rb +63 -156
data/bin/comPatMondo.rb +1 -4
data/bin/evidence_profiler.rb +38 -151
data/bin/get_network_nodes.rb +79 -132
data/bin/get_sorted_profs.rb +25 -36
data/bin/install_deps.rb +7 -0
data/bin/paco_translator.rb +29 -72
data/bin/phen2reg.rb +1 -4
data/bin/profiles2phenopacket.rb +110 -0
data/bin/reg2phen.rb +1 -3
data/example_datasets/associations_file.txt +757 -0
data/example_datasets/example_patient.txt +6 -0
data/example_datasets/example_patient_hpos.txt +15 -0
data/example_datasets/genes.txt +8 -0
data/example_datasets/hpo2ci.txt +2798 -0
data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
data/example_datasets/launch.sh +20 -0
data/external_code/generate_boxpot.R +51 -21
data/external_code/get_clusters.R +2 -2
data/external_code/install_R_dependencies.R +11 -0
data/external_code/plot_heatmap.R +34 -30
data/lib/pets/coPatReporterMethods.rb +143 -441
data/lib/pets/cohort.rb +307 -0
data/lib/pets/constants.rb +7 -0
data/lib/pets/generalMethods.rb +8 -317
data/lib/pets/genomic_features.rb +144 -0
data/lib/pets/io.rb +457 -0
data/lib/pets/parsers/cohort_parser.rb +106 -0
data/lib/pets/version.rb +1 -1
data/lib/pets.rb +8 -0
data/pets.gemspec +1 -0
data/templates/cohort_report.erb +5 -7
data/templates/patient_report.erb +1 -1
metadata +34 -3

data/lib/pets/coPatReporterMethods.rb CHANGED Viewed

@@ -1,130 +1,20 @@
-require 'numo/narray'
-require 'semtools'
-HPOS = 0
-CHR = 1
-START = 2
-STOP = 3
-def load_hpo_ontology(hpo_file, excluded_hpo_file)
-  hpo = nil
-  if !hpo_file.include?('.json')
-    if !excluded_hpo_file.nil?
-      hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
-    else
-      hpo = Ontology.new(file: hpo_file, load_file: true)
-    end
-  else
-    hpo = Ontology.new
-    hpo.read(hpo_file)
-    if !excluded_hpo_file.nil?
-      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
-      hpo.remove_removable()
-      hpo.build_index()
-    end
-  end
-  return hpo
-end
-def format_patient_data(patient_data, options, hpo)
-  rejected_hpos = []
-  rejected_patients = []
-  patient_data.each do |pat_id, patient_record|
-    hpos, chr, start, stop = patient_record
-    if options[:hpo_names]
-      hpos, pat_rejected_hpos = hpo.translate_names(hpos)
-      if !pat_rejected_hpos.empty?
-        STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAMES '#{pat_rejected_hpos.join(',')}'. Rejected."
-        rejected_hpos.concat(pat_rejected_hpos)
-      end
-    end
-    hpos, pat_rejected_hpos = hpo.check_ids(hpos.map{|a| a.to_sym})
-    if !pat_rejected_hpos.empty?
-      STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODES '#{pat_rejected_hpos.join(',')}'. Rejected."
-      rejected_hpos.concat(pat_rejected_hpos)
-    end
-    if hpos.empty?
-      rejected_patients << pat_id
-    else
-      patient_record[HPOS] = hpos
-    end
-  end
-  return rejected_hpos.uniq, rejected_patients
-end
-def compute_hpo_list_and_childs(patient_data, hpo)
-  all_hpo = []
-  suggested_childs = {}
-  total_terms = 0
-  terms_with_more_specific_childs = 0
-  patient_data.each do |pat_id, hpos|
-    total_terms += hpos.length
-    more_specific_childs = hpo.get_childs_table(hpos, true)
-    terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
-    suggested_childs[pat_id] = more_specific_childs
-    all_hpo.concat(hpos)
-  end
-  return all_hpo.uniq, suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
-end
-def clean_patient_profiles(hpo, patient_profiles)
-  rejected_patients = []
-  patient_profiles.each do |pat, prof|
-    phens = hpo.clean_profile_hard(prof)
-    if phens.empty?
-      rejected_patients << pat
-    else
-      patient_profiles[pat] = phens
-    end
-  end
-  patient_profiles.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)}
-  hpo.profiles = {}
-  hpo.load_profiles(patient_profiles)
-end
-def generate_patient_hpo_matrix(patient_data, cohort_hpos)
-  matrix = []
-  n = cohort_hpos.length
-  patient_data.each do |pat_id, pat_hpos|
-    vector = Array.new(n, 0)
-    pat_hpos.each do |hpo|
-      vector[cohort_hpos.index(hpo)] = 1
-    end
-    matrix << vector
-  end
-  return matrix
-end
-def generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
-  y_names = patient_data.keys
-  x_names = cohort_hpos
-  x_names_indx = {}
-  cohort_hpos.each_with_index{|hp,x| x_names_indx[hp]=x}
-  # row (y), cols (x)
-  matrix = Numo::DFloat.zeros(patient_data.length, cohort_hpos.length)
-  i = 0
-  patient_data.each do |pat_id, pat_hpos|
-    pat_hpos.each do |hp|
-      matrix[i, x_names_indx[hp]] = 1
-    end
-    i += 1
-  end
-  return matrix, y_names, x_names
-end
-def write_matrix_for_R(matrix, x_names, y_names, file)
-  File.open(file, 'w') do |f|
-    f.puts x_names.join("\t")
-    matrix.each_with_index do |row, i|
-      f.puts [y_names[i]].concat(row).join("\t")
-    end
-  end
-end
-def process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, patient_id_type) # get ic and chromosomes
+require 'expcalc'
+def translate_codes(clusters, hpo)
+  translated_clusters = []
+  clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
+        translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}}
+        translated_clusters << [clusterID,
+          num_of_pats,
+          patientIDs_ary,
+          patient_hpos_ary,
+          translate_codes
+        ]
+  end
+  return translated_clusters
+end
+def process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) # get ic and chromosomes
+  ont = Cohort.get_ontology(Cohort.act_ont)
   all_ics = []
   all_lengths = []
   top_cluster_phenotypes = []
@@ -132,30 +22,9 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
   multi_chromosome_patients = 0
   processed_clusters = 0
   clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
-    next if patient_ids.length == 1
-    chrs = Hash.new(0)
-    all_phens = []
-    profile_ics = []
-    profile_lengths = []
-    processed_patients = []
-    patient_ids.each do |pat_id|
-      phenotypes = patient_uniq_profiles[pat_id]
-      #pat_id = pat_id.gsub(/_i\d+$/,'') if patient_id_type != 'generated'
-      processed_patients << pat_id
-      profile_ics << get_profile_ic(phenotypes, phenotype_ic)
-      profile_lengths << phenotypes.length
-      if processed_clusters < options[:clusters2show_detailed_phen_data]
-        phen_names, rejected_codes = hpo.translate_ids(phenotypes) #optional
-        all_phens << phen_names
-      end
-      variants = equivalence[pat_id]
-      variants.each do |variant|
-        variant_data = patient_data[variant]
-        chrs[variant_data[CHR]] += 1 if !options[:chromosome_col].nil? && variant_data[CHR] != '-'
-      end
-    end
-    num_of_patients = processed_patients.length
-    next if num_of_patients == 1 # Check that current cluster only has one patient with several mutations
+    num_of_patients = patient_ids.length
+    next if num_of_patients == 1
+    chrs, all_phens, profile_ics, profile_lengths = process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
     top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data]
     all_ics << profile_ics
     all_lengths << profile_lengths
@@ -170,12 +39,29 @@ def process_clustered_patients(options, clustered_patients, patient_uniq_profile
   return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
 end
+def process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
+  chrs = Hash.new(0)
+  all_phens = []
+  profile_ics = []
+  profile_lengths = []
+  patient_ids.each do |pat_id|
+    phenotypes = patient_data.get_profile(pat_id)
+    profile_ics << get_profile_ic(phenotypes, phenotype_ic)
+    profile_lengths << phenotypes.length
+    if processed_clusters < options[:clusters2show_detailed_phen_data]
+      phen_names, rejected_codes = ont.translate_ids(phenotypes) #optional
+      all_phens << phen_names
+    end
+    patient_data.get_vars(pat_id).get_chr.each{|chr| chrs[chr] += 1} if !options[:chromosome_col].nil?
+  end
+  return chrs, all_phens, profile_ics, profile_lengths
+end
 def get_profile_ic(hpo_names, phenotype_ic)
   ic = 0
   profile_length = 0
   hpo_names.each do |hpo_id|
     hpo_ic = phenotype_ic[hpo_id]
-    # STDERR.puts phenotype_ic.inspect
     ic += hpo_ic if !hpo_ic.nil?
     profile_length += 1
   end
@@ -183,154 +69,42 @@ def get_profile_ic(hpo_names, phenotype_ic)
   return ic.fdiv(profile_length)
 end
-def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
-  File.open(cluster_ic_data_file, 'w') do |f|
-    f.puts %w[cluster_id ic Plen].join("\t")
-    all_ics.each_with_index do |cluster_ics, i|
-      break if i == limit
-      cluster_length = cluster_ics.length
-      cluster_ics.each_with_index do |clust_ic, j|
-        f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
-      end
-    end
-  end
-end
-def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
-  File.open(cluster_chromosome_data_file, 'w') do |f|
-    f.puts %w[cluster_id chr count].join("\t")
-    index = 0
-    last_id = cluster_data.first.first unless cluster_data.empty?
-    cluster_data.each do |cluster_id, patient_number, chr, count|
-      index += 1 if cluster_id != last_id
-      break if index == limit
-      f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
-      last_id = cluster_id
-    end
-  end
-end
-def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
-  File.open(coverage_to_plot_file, 'w') do |f|
-    coverage_to_plot.each do |chr, position, freq|
-     f.puts "#{chr}\t#{position}\t#{freq}"
-   end
-  end
-end
-def get_uniq_hpo_profiles(patient_data) # To avoid duplications due to more one mutation in the same patient
-  hpo_profiles = {}
-  equivalence = {}
-  patient_data.each do |variant_id, patient_rec|
-    pat_id, count = variant_id.split('_i')
-    hpo_profiles[pat_id] = patient_rec[HPOS]
-    query =  equivalence[pat_id]
-    if query.nil?
-      equivalence[pat_id] = [variant_id]
-    else
-      query << variant_id
-    end
-  end
-  return hpo_profiles, equivalence
-end
-def get_patient_ids(patient_data) # To aviod duplications due to more one mutation in the same patient
-  ids = []
-  patient_data.each do |pat_id, hpos|
-    id, count = pat_id.split('_i')
-    ids << id
-  end
-  return  ids.uniq
-end
-def get_summary_stats(patient_data, rejected_patients, cohort_hpos, hpo)
+def get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
   stats = []
-  stats << ['Unique HPO terms', cohort_hpos.length]
-  stats << ['Cohort size', get_patient_ids(patient_data).length]
+  stats << ['Unique HPO terms', hpo_stats.length]
+  stats << ['Cohort size', patient_data.profiles.length]
   stats << ['Rejected patients by empty profile', rejected_patients.length]
-  # stats << ['HPOs per patient (average)', hpo.get_profile_mean_length]
-  stats << ['HPOs per patient (average)', hpo.get_profiles_mean_size]
-  stats << ['HPO terms per patient: percentile 90', hpo.get_profile_length_at_percentile(perc=90)]
+  stats << ['HPOs per patient (average)', patient_data.get_profiles_mean_size]
+  stats << ['HPO terms per patient: percentile 90', patient_data.get_profile_length_at_percentile(perc=90)]
+  stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)]
+  stats << ['DsI for uniq HP terms', patient_data.get_dataset_specifity_index('uniq')]
+  stats << ['DsI for frequency weigthed HP terms', patient_data.get_dataset_specifity_index('weigthed')]
+  stats << ['Number of unknown phenotypes', rejected_hpos.length]
   return stats
 end
-def cluster_patients(patient_data, cohort_hpos, matrix_file, clustered_patients_file)
+def dummy_cluster_patients(patient_data, matrix_file, clust_pat_file)
   if !File.exists?(matrix_file)
-    pat_hpo_matrix, pat_id, hp_id  = generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
+    pat_hpo_matrix, pat_id, hp_id  = patient_data.to_bmatrix
     x_axis_file = matrix_file.gsub('.npy','_x.lst')
-    File.open(x_axis_file, 'w'){|f| f.print hp_id.join("\n") }
     y_axis_file = matrix_file.gsub('.npy','_y.lst')
-    File.open(y_axis_file, 'w'){|f| f.print pat_id.join("\n") }
-    Npy.save(matrix_file, pat_hpo_matrix)
+    pat_hpo_matrix.save(matrix_file, hp_id, x_axis_file, pat_id, y_axis_file)
   end
-  system("#{File.join(EXTERNAL_CODE, 'get_clusters.R')} -d #{matrix_file} -o #{clustered_patients_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clustered_patients_file)
-  clustered_patients = load_clustered_patients(clustered_patients_file)
+  system_call(EXTERNAL_CODE, 'get_clusters.R', "-d #{matrix_file} -o #{clust_pat_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clust_pat_file)
+  clustered_patients = load_clustered_patients(clust_pat_file)
   return(clustered_patients)
 end
-def get_profile_ontology_distribution_tables(hpo)
-  ontology_levels, distribution_percentage = hpo.get_profile_ontology_distribution_tables
-  ontology_levels.unshift(["level", "ontology", "cohort"])
-  distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
-  return ontology_levels, distribution_percentage
-end
-def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
-  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
-    suggested_childs.each do |pat_id, suggestions|
-      warning = nil
-      warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
-      csv << ["PATIENT #{pat_id}", "#{warning}"]
-      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
-      suggestions.each do |parent, childs|
-        parent_code, parent_name = parent
-        if childs.empty?
-          csv << ["#{parent_name} (#{parent_code})", '-']
-        else
-          parent_writed = false
-          childs.each do |child_code, child_name|
-            if !parent_writed
-              parent_field = "#{parent_name} (#{parent_code})"
-              parent_writed = true
-            else
-              parent_field = ""
-            end
-            csv << [parent_field, "#{child_name} (#{child_code})"]
-          end
-        end
-      end
-      csv << ["", ""]
+def get_mean_size(all_sizes)
+    accumulated_size = 0
+    number = 0
+    all_sizes.each do |size, occurrences|
+      accumulated_size += size *occurrences
+      number += occurrences
     end
-  end
+    return accumulated_size.fdiv(number)
 end
-def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
-  File.open(filename, 'w') do |f|
-    f.puts "#{x_axis_name}\t#{y_axis_name}"
-    x_axis_value.each_with_index do |value,i|
-      y_value = y_axis_value[i]
-      raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
-      f.puts [value, y_value].join("\t")
-    end
-  end
-end
-def process_patient_data(patient_data)
-	parsed_patient_data = {}
-	patient_data.each do |patientID, metadata|
-		phenotypes, chr, start, stop = metadata
-    next if chr == '-'
-		info = [patientID, start.to_i, stop.to_i]
-		query = parsed_patient_data[chr]
-		if query.nil?
-			parsed_patient_data[chr] = [info]
-		else
-			query << info
-		end
-	end
-	return parsed_patient_data
-end
 def get_final_coverage(raw_coverage, bin_size)
 	coverage_to_plot = []
@@ -360,185 +134,113 @@ def get_sor_length_distribution(raw_coverage)
 	return all_cnvs_length
 end
-def get_cnvs_length(patient_data)
-	length_stats = Hash.new(0)
-	patient_data.each do |pat_id, patient_record|
-    	string_hpos, chr, start, stop = patient_record
-    	length_stats[stop - start] += 1
-    end
-    return length_stats.to_a.sort!{|stat| stat[1] <=> stat[1] }
-end
 def calculate_coverage(regions_data, delete_thresold = 0)
 	raw_coverage = {}
 	n_regions = 0
 	patients = 0
 	nt = 0
-	regions_data.each do |start, stop, chr, node|
-		number_of_patients = node.split('.').last.to_i
+	regions_data.each do |start, stop, chr, reg_id|
+		number_of_patients = reg_id.split('.').last.to_i
 		if number_of_patients <= delete_thresold
 			number_of_patients = 0
 		else
 			n_regions += 1
-			patients += number_of_patients
 			nt += stop - start
 		end
-		coords = [start, stop, number_of_patients]
-		query = raw_coverage[chr]
-		if query.nil?
-			raw_coverage[chr] = [coords]
-		else
-			query << coords
-		end
+    add_record(raw_coverage, chr, [start, stop, number_of_patients])
+		patients += number_of_patients
 	end
 	return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
 end
-def get_profile_redundancy(hpo)
-  #TODO: sort both arrays consequently
-  #TODO: bear in mind join both arrays with zip and sort by one, to get an [a[a]]
-  # profile_sizes = hpo.get_profile_sizes
-  profile_sizes = hpo.get_profiles_sizes
-  # parental_hpos_per_profile = hpo.compute_redundant_parental_terms_per_profile
-  parental_hpos_per_profile = hpo.parentals_per_profile# clean_profiles
-  parental_hpos_per_profile = parental_hpos_per_profile.map{|item| item[0]}
-  profile_sizes, parental_hpos_per_profile = profile_sizes.zip(parental_hpos_per_profile).sort_by{|i| i.first}.reverse.transpose
-  return profile_sizes, parental_hpos_per_profile
-end
-def format_profiles_similarity_data(profiles_similarity)
-  matrix = []
-  element_names = profiles_similarity.keys
-  matrix << element_names
-  profiles_similarity.each do |elementA, relations|
-    row = [elementA]
-    element_names.each do |elementB|
-      if elementA == elementB
-        row << 'NA'
-      else
-        query = relations[elementB]
-        if !query.nil?
-          row << query
-        else
-          row << profiles_similarity[elementB][elementA]
-        end
-      end
-    end
-    matrix << row
-  end
-  matrix[0].unshift('pat')
-  return matrix
-end
-def format_profiles_similarity_data_pairs(profiles_similarity)
-  pairs = []
-  element_names = profiles_similarity.keys
-  profiles_similarity.each do |elementA, relations|
-    element_names.each do |elementB|
-      if elementA != elementB
-        pair = [elementA, elementB]
-        query = relations[elementB]
-        if !query.nil?
-          pair << query
-        else
-          pair << profiles_similarity[elementB][elementA]
-        end
-        pairs << pair
+def get_top_dummy_clusters_stats(top_clust_phen)
+  new_cluster_phenotypes = {}
+  top_clust_phen.each_with_index do |cluster, clusterID|
+    phenotypes_frequency = Hash.new(0)
+    total_patients = cluster.length
+    cluster.each do |phenotypes|
+      phenotypes.each do |p|
+        phenotypes_frequency[p] += 1
       end
     end
-  end
-  return pairs
-end
-def format_profiles_similarity_data_numo(profiles_similarity)
-  element_names = profiles_similarity.keys
-  matrix = Numo::DFloat.zeros(element_names.length, element_names.length)
-  i = 0
-  profiles_similarity.each do |elementA, relations|
-    element_names.each_with_index do |elementB, j|
-      if elementA != elementB
-        query = relations[elementB]
-        if !query.nil?
-          matrix[i, j] = query
-        else
-          matrix[i, j] = profiles_similarity[elementB][elementA]
-        end
-      end
-    end
-    i += 1
-  end
-  return matrix, element_names
-end
-def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
-  File.open(similarity_matrix_file, 'w') do |f|
-    similarity_matrix.each do |row|
-      f.puts row.join("\t")
-    end
-  end
-end
-def write_profile_pairs(similarity_pairs, filename)
-  File.open(filename, 'w') do |f|
-    similarity_pairs.each do |pairsA, pairsB_and_values|
-      pairsB_and_values.each do |pairsB, values|
-        f.puts "#{pairsA}\t#{pairsB}\t#{values}"
+    new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
+  end
+  return new_cluster_phenotypes
+end
+def remove_nested_entries(nested_hash)
+  empty_root_ids = []
+  nested_hash.each do |root_id, entries|
+    entries.select!{|id, val| yield(id, val)}
+    empty_root_ids << root_id if entries.empty?
+  end
+  empty_root_ids.each{|id| nested_hash.delete(id)}
+end
+def get_semantic_similarity_clustering(options, patient_data, temp_folder)
+  template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read
+  hpo = Cohort.get_ontology(Cohort.act_ont)
+  reference_profiles = nil
+  reference_profiles = load_profiles(options[:reference_profiles], hpo) if !options[:reference_profiles].nil?
+  Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name|
+    matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
+    profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
+    clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
+    if !File.exists?(matrix_filename)
+      profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
+      remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
+      write_profile_pairs(profiles_similarity, profiles_similarity_filename)
+      if reference_profiles.nil?
+        axis_file = matrix_filename.gsub('.npy','.lst')
+        similarity_matrix, axis_names = profiles_similarity.to_wmatrix(squared: true, symm: true)
+        similarity_matrix.save(matrix_filename, axis_names, axis_file)
+      else
+        axis_file_x = matrix_filename.gsub('.npy','_x.lst')
+        axis_file_y = matrix_filename.gsub('.npy','_y.lst')
+        similarity_matrix, y_names, x_names = profiles_similarity.to_wmatrix(squared: false, symm: true)
+        similarity_matrix.save(matrix_filename, y_names, axis_file_y, x_names, axis_file_x)
       end
     end
-  end
-end
-def parse_clusters_file(clusters_file, patient_profiles)
-  clusters_info = {}
-  clusters_table = []
-  File.open(clusters_file).each do |line|
-    line.chomp!
-    patientID, clusterID = line.split("\t")
-    patientHPOProfile = patient_profiles[patientID]
-    query = clusters_info[clusterID]
-    if query.nil?
-      clusters_info[clusterID] = {patientID => patientHPOProfile}
-    else
-      query[patientID] = patientHPOProfile
-    end
-  end
-  clusters_info.each do |clusterID, patients_info|
-    patients_per_cluster = patients_info.keys.length
-    clusters_table << [clusterID, patients_per_cluster, patients_info.keys, patients_info.values]
-  end
-  return clusters_table, clusters_info
-end
-def get_patient_hpo_frequency(patient_uniq_profiles, hpo_frequency_file)
-  hpo_frequency = Hash.new(0)
-  patient_uniq_profiles.values.each do |hpos|
-    hpos.each do |hpo|
-      hpo_frequency[hpo] += 1
-    end
-  end
-  File.open(hpo_frequency_file, 'w') do |f|
-    hpo_frequency.each do |hpo_code, freq|
-      f.puts "#{hpo_code.to_s}\t#{freq}"
-    end
-  end
-end
-def get_cluster_metadata(clusters_info, output_file)
+    ext_var = ''
+    if method_name == 'resnik'
+      ext_var = '-m max'
+    elsif method_name == 'lin'
+      ext_var = '-m comp1'
+    end
+    cluster_file = "#{method_name}_clusters.txt"
+    if !reference_profiles.nil?
+      ext_var << ' -s'
+      axis_file = "#{axis_file_y},#{axis_file_x}"
+      cluster_file = "#{method_name}_clusters_rows.txt"
+    end
+    out_file = File.join(temp_folder, method_name)
+    system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file +  '_heatmap.png')
+    clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, cluster_file), patient_data)
+    write_patient_hpo_stat(get_cluster_metadata(clusters_info), clusters_distribution_filename)
+    out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
+    system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
+    clusters = translate_codes(clusters_codes, hpo)
+    container = {
+      :temp_folder => temp_folder,
+      :cluster_name => method_name,
+      :clusters => clusters,
+      :hpo => hpo
+     }
+    report = Report_html.new(container, 'Patient clusters report')
+    report.build(template)
+    report.write(options[:output_file]+"_#{method_name}_clusters.html")
+    system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -m #{method_name} -o #{File.join(temp_folder, method_name + '_sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png'))
+  end
+end
+def get_cluster_metadata(clusters_info)
   average_hp_per_pat_distribution = []
-  tmp = []
   clusters_info.each do |cl_id, pat_info|
       hp_per_pat_in_clust = pat_info.values.map{|a| a.length}
       hp_per_pat_ave = hp_per_pat_in_clust.sum.fdiv(hp_per_pat_in_clust.length)
       average_hp_per_pat_distribution << [pat_info.length, hp_per_pat_ave]
-      tmp << hp_per_pat_in_clust
-  end
-  total_clusters = clusters_info.length
-  average_phenotypes_by_cluster = tmp.flatten.sum.fdiv(total_clusters)
-  File.open(output_file, 'w') do |f|
-    f.puts "#{'PatientsNumber'}\t#{'HPOAverage'}"
-    average_hp_per_pat_distribution.each do |patient_num, ave|
-      f.puts "#{patient_num}\t#{ave}"
-    end
   end
+  return average_hp_per_pat_distribution
 end