RubyGems - pets - Versions diffs - 0.2.3 → 0.2.5 - Mend

pets 0.2.3 → 0.2.5

Files changed (42) hide show

checksums.yaml +4 -4
data/Gemfile +2 -0
data/README.md +79 -5
data/bin/coPatReporter.rb +68 -156
data/bin/comPatMondo.rb +1 -4
data/bin/evidence_profiler.rb +102 -150
data/bin/get_gen_features.rb +146 -0
data/bin/get_network_nodes.rb +79 -132
data/bin/get_sorted_profs.rb +25 -36
data/bin/install_deps.rb +8 -0
data/bin/paco_translator.rb +29 -72
data/bin/phen2reg.rb +1 -4
data/bin/profiles2phenopacket.rb +86 -0
data/bin/reg2phen.rb +1 -3
data/example_datasets/associations_file.txt +757 -0
data/example_datasets/example_patient.txt +6 -0
data/example_datasets/example_patient_hpos.txt +15 -0
data/example_datasets/genes.txt +8 -0
data/example_datasets/hpo2ci.txt +2798 -0
data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
data/example_datasets/launch.sh +20 -0
data/external_code/generate_boxpot.R +51 -21
data/external_code/get_clusters.R +2 -2
data/external_code/install_R_dependencies.R +16 -0
data/external_code/plot_heatmap.R +34 -30
data/lib/pets/coPatReporterMethods.rb +172 -424
data/lib/pets/cohort.rb +309 -0
data/lib/pets/common_optparse.rb +30 -0
data/lib/pets/constants.rb +8 -0
data/lib/pets/generalMethods.rb +29 -319
data/lib/pets/genomic_features.rb +240 -0
data/lib/pets/io.rb +481 -0
data/lib/pets/parsers/cohort_parser.rb +111 -0
data/lib/pets/parsers/reference_parser.rb +39 -0
data/lib/pets/version.rb +1 -1
data/lib/pets.rb +9 -0
data/pets.gemspec +7 -3
data/templates/cluster_report.erb +25 -5
data/templates/cohort_report.erb +5 -7
data/templates/evidence_profile.erb +20 -4
data/templates/patient_report.erb +1 -1
metadata +96 -5

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d5b4a8a64787d15e3741fac94c70855625e040b670d03aec8b5df2cf1f7d6a95
-  data.tar.gz: a6fdec46047c84df897d2f63aee4ebd9acfb888c33f77a90c38b5e19821558cf
+  metadata.gz: 0f1d5c3ad0cb57b26b2c67e02b38a282139965472ded083acf0d1fcae48c0fec
+  data.tar.gz: 8b34f2440afe74f0b9c0e6024c2a05daee4a7be0efd0c6a3d80aef49673c7c7a
 SHA512:
-  metadata.gz: 1956197b3c36a3f4bd34722ed201f1a50cb30fb0b40976ef330186d238039e0d392b0a68b99f6478b13c288c0201f6b17859e8efe88700f05cac4f43301c1e84
-  data.tar.gz: 96465011731776d68719b53e548cf70475f6e4467c261a70da040b22d418eff556f9c0f09092f030209fba35d41cb02ab8ce9b3819f8b4be04a30b360f569a84
+  metadata.gz: d3e9bc8559bb3f3e0c9a7ce1e0658645f54afc83fc49e7415cc3177576af975e4f7268a1f37e017d83fd88042197f102a4290cc29d7b0a16e12ec4964feea39d
+  data.tar.gz: a2aa8fe161b52d2f3e86e0d04f2a1a762298de95582098e553287ad905ff7b97eae6f96893de8bd78676c01b412a16973b1b53cf6ebbd9cb48e49c929c7f1d74

data/Gemfile CHANGED Viewed

@@ -4,3 +4,5 @@ source "https://rubygems.org"
 gemspec
 semtools_dev_path = File.expand_path('~/dev_gems/semtools')
 gem "semtools", github: "seoanezonjic/semtools", branch: "master" if Dir.exists?(semtools_dev_path)
+expcalc_dev_path = File.expand_path('~/dev_gems/expcalc')
+gem "expcalc", github: "seoanezonjic/expcalc", branch: "master" if Dir.exist?(expcalc_dev_path)

data/README.md CHANGED Viewed

@@ -1,10 +1,11 @@
-# Pets
+# PETS
-Pets (Patient exploration tools suite) include tools for the analysis of cohorts of patients with pathological phenotypes described in terms of the Human Phenotype Ontology (HPO) and the position their genomic variants clinically determined.
+PETS (Patient Exploration Tools Suite) includes three different tools for the analysis of cohorts of patients with pathological phenotypes described in terms of the Human Phenotype Ontology (HPO) and the clinically determined positions of their genomic variants.
-Pets include tools to (1) perform cohort analysis (coPatReporter.rb), (2) searching for pathological phenotypes associated with a genomic region of interest (reg2phen.rb) and (3) predict regions of the genome that potentially lead to the pathological phenotypes observed in a patient (phen2reg.rb).
+It can (1) determine the quality of information within a patient cohort with Cohort Analyzer (coPatReporter.rb); (2) associate genomic regions with their pathological phenotypes based on the cohort data with Reg2Phen (reg2phen.rb), and (3) predict the possible genetic variants that cause the clinically observed pathological phenotypes using phenotype-genotype association values with Phen2Reg (phen2reg.rb).
+This tool has been developed to be used by the clinical community, to facilitate patient characterisation, help identify where data quality can be improved within a cohort and help diagnose patients with complex disease. Please cite us as Rojano E., Seoane-Zonjic P., Jabato F.M., Perkins J.R., Ranea J.A.G. (2020) Comprehensive Analysis of Patients with Undiagnosed Genetic Diseases Using the Patient Exploration Tools Suite (PETS). In: Rojas I., Valenzuela O., Rojas F., Herrera L., Ortuño F. (eds) Bioinformatics and Biomedical Engineering. IWBBIO 2020. Lecture Notes in Computer Science, vol 12108. Springer, Cham. https://doi.org/10.1007/978-3-030-45385-5_69.
-Associations between pathological phenotypes and genomic regions (using genomic coordinates from GRCh37 human assembly) are previously calculated using NetAnalyzer (https://rubygems.org/gems/NetAnalyzer). Please cite us as Rojano E. et al (2017). Revealing the Relationship Between Human Genome Regions and Pathological Phenotypes Through Network Analysis. LNCS, 10208:197-207.
 ## Installation
@@ -22,9 +23,82 @@ Or install it yourself as:
     $ gem install pets
+After installing PETS Gem, R dependencies must be installed. For this, the user must run the following command:
+    $ install_deps.rb
 ## Usage
-TODO: Write usage instructions here
+### 1) Cohort Analyzer
+Cohort Analyzer measures the phenotyping quality of patient and disease cohorts by calculating multiple statistics to give a general overview of the cohort status in terms of the depth and breadth of phenotyping. It can work with cohorts defined exclusively with HPO terms or with both HPO terms and genomic coordinates.
+#### Basic usage of Cohort Analyzer:
+We provide an example of use of Cohort Analyzer with a dataset from Vulto-van Silfhout, A.T.; Hehir-Kwa, J.Y.; van Bon, B.W.M.; Schuurs-Hoeijmakers, J.H.M.; Meader, S.; Hellebrekers, C.J.M.; Thoonen, I.J.M.; de Brouwer, A.P.M.; Brunner, H.G.; Webber, C.; Pfundt, R.; de Leeuw, N.; De Vries, B.B.A. Clinical Significance of De Novo and Inherited Copy-Number Variation. Human Mutation 2013, 34, 1679–1687. doi:10.1002/humu.22442.
+This dataset links de novo and inherited CNVs to phenotypes related to intellectual disability/developmental delay occurring alongside multiple congenital anomalies. An example of an input file is available in the example_datasets folder within this repository and the code to execute its analysis is provided below:
+```
+coPatReporter.rb -i hummu_congenital_full_dataset.txt -o results -p phenotypes -c chr -d patient_id -s start -e stop -m lin
+```
+Where:
+- -i -> Input cohort, a tab file with patient identifiers and the list of HPOs characterised for each patient.
+- -o -> Output path.
+- -p -> Column name with phenotypes.
+- -c -> Column name with chromosomes.
+- -d -> Column name with patient identifiers.
+- -s -> Column name with start genomic coordinate.
+- -e -> Column name with final genomic coordinate.
+- -m -> Semantic similarity measure method.
+- -C -> Maximum number of clusters to display.
+Further information on all Cohort Analyzer setup options can be obtained as follows:
+```
+coPatReporter.rb --help
+```
+### 2) Reg2Phen
+This tool is a search engine that finds phenotypes associated with genomic regions or genes of interest. It uses two input files: one with previously calculated phenotype-genotype associations, and one with a list of genomic coordinates or gene identifiers for which to find the associated HPO terms. We provide an example of use in the example_datasets folder within this repository and the code to execute its analysis is provided below:
+```
+reg2phen.rb -t associations_file.txt -p genes.txt -b hpo_file -P -g -H -o results/patient1Genes.txt -F $current/results/patient1Genes.html
+```
+Where:
+- -t -> Input phenotype-genotype associations file.
+- -p -> List of genes for which to find associated HPOs.
+- -b -> HPO obo file.
+- -P -> Transform association values into P-values.
+- -g -> Set if gene identifiers are provided instead of genome coordinates.
+- -H -> Activate HTML reporting.
+- -o -> Output folder.
+- -F -> Path of the output HTML report file.
+Associations between pathological phenotypes and genomic regions provided in this example were calculated with NetAnalyzer (https://rubygems.org/gems/NetAnalyzer, Rojano E. et al (2017). Revealing the Relationship Between Human Genome Regions and Pathological Phenotypes Through Network Analysis. LNCS, 10208:197-207) using randomised DECIPHER data (coordinates in the GRCh37 human genome assembly) and the hypergeometric association method.
+### 3) Phen2Reg
+Phen2Reg analyses the pathological phenotypes observed in a patient and predicts putative causal genomic regions. As in the case of Reg2Phen, it uses phenotype-genotype associations previously calculated. We provide an example of use in the example_datasets folder within this repository and the code to execute its analysis is provided below:
+```
+phen2reg.rb -t associations_file.txt -p example_patient_hpos.txt -i hpo2ci.txt -f hpo_file -T -Q > single_phens.txt
+```
+Where:
+- -t -> Input phenotype-genotype associations file.
+- -p -> List of HPOs characterised for a patient.
+- -i -> HPO information coefficients (IC) file.
+- -f -> HPO obo file.
+- -T -> Deactivate HTML reporting.
+- -Q -> Deactivate quality control.
+Results are saved in the single_phens.txt output file.
 ## Development

data/bin/coPatReporter.rb CHANGED Viewed

@@ -1,45 +1,13 @@
 #! /usr/bin/env ruby
 ROOT_PATH = File.dirname(__FILE__)
-REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
-EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
-EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
-HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
-IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
 $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
 require 'benchmark'
 require 'parallel'
 require 'optparse'
-require 'csv'
-require 'npy'
-require 'generalMethods.rb'
-require 'coPatReporterMethods.rb'
 require 'report_html'
-require 'semtools'
-#Expand class (semtools modifications if necessary):
-class Ontology
-end
-##########################
-# FUNCTIONS
-##########################
-def translate_codes(clusters, hpo)
-  translated_clusters = []
-  clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
-        translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}}
-        translated_clusters << [clusterID,
-          num_of_pats,
-          patientIDs_ary,
-          patient_hpos_ary,
-          translate_codes
-        ]
-  end
-  return translated_clusters
-end
+require 'pets'
 ##########################
 #OPT-PARSER
@@ -69,9 +37,14 @@ OptionParser.new do |opts|
     options[:chromosome_col] = data
   end
-  options[:pat_id_col] = nil
+  options[:id_col] = nil
   opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
-    options[:pat_id_col] = data
+    options[:id_col] = data
+  end
+  options[:detailed_clusters] = false
+  opts.on("-D", "--detailed_clusters", "Show detiled cluster comparation using heatmaps. Default false") do
+    options[:detailed_clusters] = true
   end
   options[:excluded_hpo] = nil
@@ -120,9 +93,9 @@ OptionParser.new do |opts|
     options[:clustering_methods] = data.split(',')
   end
-  options[:hpo_names] = false
+  options[:names] = false
   opts.on("-n", "--hpo_names", "Define if the input HPO are human readable names. Default false") do
-    options[:hpo_names] = true
+    options[:names] = true
   end
   options[:output_file] = nil
@@ -135,14 +108,14 @@ OptionParser.new do |opts|
     options[:hpo_file] = value
   end
-  options[:hpo_col] = nil
+  options[:ont_col] = nil
   opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
-  	options[:hpo_col] = data
+  	options[:ont_col] = data
   end
-  options[:hpo_separator] = '|'
+  options[:separator] = '|'
   opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data|
-  	options[:hpo_separator] = data
+  	options[:separator] = data
   end
   options[:start_col] = nil
@@ -165,7 +138,15 @@ OptionParser.new do |opts|
     options[:threads] = data.to_i
   end
+  options[:reference_profiles] = nil
+  opts.on("--reference_profiles PATH", "Path to file tabulated file with first column as id profile and second column with ontology terms separated by separator. ") do |opt|
+    options[:reference_profiles] = opt
+  end
+  options[:sim_thr] = nil
+  opts.on("--sim_thr FLOAT", "Keep pairs with similarity value >= FLOAT. ") do |opt|
+    options[:sim_thr] = opt.to_f
+  end
   opts.on_tail("-h", "--help", "Show this message") do
     puts opts
@@ -203,80 +184,68 @@ cluster_ic_data_file = File.join(temp_folder, 'cluster_ic_data.txt')
 cluster_chromosome_data_file = File.join(temp_folder, 'cluster_chromosome_data.txt')
 coverage_to_plot_file = File.join(temp_folder, 'coverage_data.txt')
 sor_coverage_to_plot_file = File.join(temp_folder, 'sor_coverage_data.txt')
+ronto_file = File.join(temp_folder, 'hpo_freq_colour')
 Dir.mkdir(temp_folder) if !File.exists?(temp_folder)
 hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE
-hpo = load_hpo_ontology(hpo_file, options[:excluded_hpo])
+Cohort.load_ontology(:hpo, hpo_file, options[:excluded_hpo])
+Cohort.act_ont = :hpo
-patient_data = load_patient_cohort(options)
+patient_data, rejected_hpos_L, rejected_patients_L = Cohort_Parser.load(options)
+rejected_hpos_C, rejected_patients_C = patient_data.check
+rejected_hpos = rejected_hpos_L | rejected_hpos_C
+rejected_patients = rejected_patients_L + rejected_patients_C
+File.open(rejected_file, 'w'){|f| f.puts (rejected_patients).join("\n")}
-rejected_hpos, rejected_patients = format_patient_data(patient_data, options, hpo)
-File.open(rejected_file, 'w'){|f| f.puts rejected_patients.join("\n")}
-patient_data.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)}
-patient_uniq_profiles, equivalence = get_uniq_hpo_profiles(patient_data)
-hpo.load_profiles(patient_uniq_profiles)
+patient_data.link2ont(Cohort.act_ont) # TODO: check if method load should call to this and use the semtools checking methods (take care to only remove invalid terms)
-profile_sizes, parental_hpos_per_profile = get_profile_redundancy(hpo)
-clean_patient_profiles(hpo, patient_uniq_profiles)
-cohort_hpos, suggested_childs, fraction_terms_specific_childs = compute_hpo_list_and_childs(patient_uniq_profiles, hpo)
-ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables(hpo)
+profile_sizes, parental_hpos_per_profile = patient_data.get_profile_redundancy
+patient_data.check(hard=true)
+hpo_stats = patient_data.get_profiles_terms_frequency() # hpo NAME, freq
+hpo_stats.each{ |stat| stat[1] = stat[1]*100}
+File.open(hpo_frequency_file, 'w') do |f|
+  patient_data.get_profiles_terms_frequency(translate: false).each do |hpo_code, freq| # hpo CODE, freq
+    f.puts "#{hpo_code.to_s}\t#{freq}"
+  end
+end
+suggested_childs, fraction_terms_specific_childs = patient_data.compute_term_list_and_childs()
+ontology_levels, distribution_percentage = patient_data.get_profile_ontology_distribution_tables()
+onto_ic, freq_ic, onto_ic_profile, freq_ic_profile = patient_data.get_ic_analysis()
-onto_ic, freq_ic = hpo.get_observed_ics_by_onto_and_freq # IC for TERMS
-onto_ic_profile, freq_ic_profile = hpo.get_profiles_resnik_dual_ICs # IC for PROFILES
 if options[:ic_stats] == 'freq_internal'
-  ic_file = ENV['ic_file']
-  ic_file = IC_FILE if ic_file.nil?
+  ic_file = !ENV['ic_file'].nil? ? ENV['ic_file'] : IC_FILE
   freq_ic = load_hpo_ci_values(ic_file)
   phenotype_ic = freq_ic
   freq_ic_profile = {}
-  patient_uniq_profiles.each do |pat_id, phenotypes|
+  patient_data.each_profile do |pat_id, phenotypes|
     freq_ic_profile[pat_id] = get_profile_ic(phenotypes, phenotype_ic)
   end
-else
-  if options[:ic_stats] == 'freq'
-    phenotype_ic = freq_ic
-  elsif options[:ic_stats] == 'onto'
-    phenotype_ic = onto_ic
-  end
+elsif options[:ic_stats] == 'freq'
+  phenotype_ic = freq_ic
+elsif options[:ic_stats] == 'onto'
+  phenotype_ic = onto_ic
 end
-clustered_patients = cluster_patients(patient_uniq_profiles, cohort_hpos, matrix_file, clustered_patients_file)
-all_ics, profile_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients = process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, options[:pat_id_col])
-get_patient_hpo_frequency(patient_uniq_profiles, hpo_frequency_file)
-summary_stats = get_summary_stats(patient_uniq_profiles, rejected_patients, cohort_hpos, hpo)
-summary_stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)]
-summary_stats << ['DsI for uniq HP terms', hpo.get_dataset_specifity_index('uniq')]
-summary_stats << ['DsI for frequency weigthed HP terms', hpo.get_dataset_specifity_index('weigthed')]
+clustered_patients = dummy_cluster_patients(patient_data.profiles, matrix_file, clustered_patients_file)
+all_ics, prof_lengths, clust_by_chr, top_clust_phen, multi_chr_pats = process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic)
-hpo_stats = hpo.get_profiles_terms_frequency()
-hpo_stats.each{ |stat| stat[1] = stat[1]*100}
-summary_stats << ['Number of unknown phenotypes', rejected_hpos.length]
+summary_stats = get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
 all_cnvs_length = []
 if !options[:chromosome_col].nil?
-  summary_stats << ['Number of clusters with mutations accross > 1 chromosomes', multi_chromosome_patients]
+  summary_stats << ['Number of clusters with mutations accross > 1 chromosomes', multi_chr_pats]
   #----------------------------------
   # Prepare data to plot coverage
   #----------------------------------
   if options[:coverage_analysis]
-    processed_patient_data = process_patient_data(patient_data)
-    cnv_sizes = []
-    processed_patient_data.each do |chr, metadata|
-      metadata.each do |patientID, start, stop|
-        cnv_sizes << stop - start
-      end
-    end
-    cnv_size_average = cnv_sizes.inject{ |sum, el| sum + el }.fdiv(cnv_sizes.length.to_f)
-    patients_by_cluster, sors = generate_cluster_regions(processed_patient_data, 'A', 0)
-    total_patients_sharing_sors = []
-    all_patients = patients_by_cluster.keys
-    all_patients.each do |identifier|
-      total_patients_sharing_sors << identifier.split('_i').first
-    end
-    all_cnvs_length = get_cnvs_length(patient_data)
+    patient_data.index_vars
+    all_cnvs_length = patient_data.get_vars_sizes(true)
+    cnv_size_average = get_mean_size(all_cnvs_length)
+    patients_by_cluster, sors = patient_data.generate_cluster_regions(:reg_overlap, 'A', 0)
     ###1. Process CNVs
     raw_coverage, n_cnv, nt, pats_per_region = calculate_coverage(sors)
     summary_stats << ['Average variant size', cnv_size_average.round(4)]
@@ -288,7 +257,7 @@ if !options[:chromosome_col].nil?
     ###2. Process SORs
     raw_sor_coverage, n_sor, nt, pats_per_region = calculate_coverage(sors, options[:patients_filter] - 1)
     summary_stats << ["Number of genome window shared by >= #{options[:patients_filter]} patients", n_sor]
-    summary_stats << ["Number of patients with at least 1 SOR", total_patients_sharing_sors.uniq.length]
+    summary_stats << ["Number of patients with at least 1 SOR", patients_by_cluster.length]
     summary_stats << ['Nucleotides affected by mutations', nt]
     # summary_stats << ['Patient average per region', pats_per_region]
     sor_coverage_to_plot = get_final_coverage(raw_sor_coverage, options[:bin_size])
@@ -304,20 +273,16 @@ write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluat
 write_arrays4scatterplot(onto_ic.values, freq_ic.values, hpo_ic_file, 'OntoIC', 'FreqIC') # hP terms
 write_arrays4scatterplot(onto_ic_profile.values, freq_ic_profile.values, hpo_profile_ic_file, 'OntoIC', 'FreqIC') #HP profiles
 write_arrays4scatterplot(profile_sizes, parental_hpos_per_profile, parents_per_term_file, 'ProfileSize', 'ParentTerms')
+write_cluster_ic_data(all_ics, prof_lengths, cluster_ic_data_file, options[:clusters2graph])
 system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{hpo_ic_file} -o #{File.join(temp_folder, 'hpo_ics.pdf')} -x 'OntoIC' -y 'FreqIC' --x_tag 'HP Ontology IC' --y_tag 'HP Frequency based IC' --x_lim '0,4.5' --y_lim '0,4.5'") if !File.exists?(File.join(temp_folder, 'hpo_ics.pdf'))
 system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{hpo_profile_ic_file} -o #{File.join(temp_folder, 'hpo_profile_ics.pdf')} -x 'OntoIC' -y 'FreqIC' --x_tag 'HP Ontology Profile IC' --y_tag 'HP Frequency based Profile IC' --x_lim '0,4.5' --y_lim '0,4.5'") if !File.exists?(File.join(temp_folder, 'hpo_profile_ics.pdf'))
 system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{parents_per_term_file} -o #{File.join(temp_folder, 'parents_per_term.pdf')} -x 'ProfileSize' -y 'ParentTerms' --x_tag 'Patient HPO profile size' --y_tag 'Parent HPO terms within the profile'")
-###Cohort frequency calculation
-ronto_file = File.join(temp_folder, 'hpo_freq_colour')
-system_call(EXTERNAL_CODE, 'ronto_plotter.R', "-i #{hpo_frequency_file} -o #{ronto_file} --root_node #{options[:root_node]} -O #{hpo_file.gsub('.json','.obo')}") if !File.exist?(ronto_file + '.png')
-write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, options[:clusters2graph])
+system_call(EXTERNAL_CODE, 'ronto_plotter.R', "-i #{hpo_frequency_file} -o #{ronto_file} --root_node #{options[:root_node]} -O #{hpo_file.gsub('.json','.obo')}") if !File.exist?(ronto_file + '.png') ###Cohort frequency calculation
 system_call(EXTERNAL_CODE, 'plot_boxplot.R', "#{cluster_ic_data_file} #{temp_folder} cluster_id ic 'Cluster size/id' 'Information coefficient' 'Plen' 'Profile size'")
 if !options[:chromosome_col].nil?
-  write_cluster_chromosome_data(cluster_data_by_chromosomes, cluster_chromosome_data_file, options[:clusters2graph])
+  write_cluster_chromosome_data(clust_by_chr, cluster_chromosome_data_file, options[:clusters2graph])
   system_call(EXTERNAL_CODE, 'plot_scatterplot.R', "#{cluster_chromosome_data_file} #{temp_folder} cluster_id chr count 'Cluster size/id' 'Chromosome' 'Patients'")
   if options[:coverage_analysis]
     ###1. Process CNVs
@@ -332,69 +297,16 @@ end
 #----------------------------------
 # CLUSTER COHORT ANALYZER REPORT
 #----------------------------------
-Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name|
-  matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
-  axis_file = matrix_filename.gsub('.npy','.lst')
-  profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
-  clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
-  if !File.exists?(matrix_filename)
-    profiles_similarity = hpo.compare_profiles(sim_type: method_name.to_sym)
-    write_profile_pairs(profiles_similarity, profiles_similarity_filename)
-    similarity_matrix, axis_names = format_profiles_similarity_data_numo(profiles_similarity)
-    File.open(axis_file, 'w'){|f| f.print axis_names.join("\n") }
-    Npy.save(matrix_filename, similarity_matrix)
-  end
-  ext_var = ''
-  if method_name == 'resnik'
-    ext_var = '-m max'
-  elsif method_name == 'lin'
-    ext_var = '-m comp1'
-  end
-  out_file = File.join(temp_folder, method_name)
-  system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file +  '_heatmap.png')
-  clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, "#{method_name}_clusters.txt"), patient_uniq_profiles)
-  get_cluster_metadata(clusters_info, clusters_distribution_filename)
-  out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
-  system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
-  clusters = translate_codes(clusters_codes, hpo)
-  container = {
-    :temp_folder => temp_folder,
-    :cluster_name => method_name,
-    :clusters => clusters,
-    :hpo => hpo
-   }
-  template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read
-  report = Report_html.new(container, 'Patient clusters report')
-  report.build(template)
-  report.write(options[:output_file]+"_#{method_name}_clusters.html")
-end
-system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -o #{File.join(temp_folder, 'sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png'))
+get_semantic_similarity_clustering(options, patient_data, temp_folder)
 #----------------------------------
 # GENERAL COHORT ANALYZER REPORT
 #----------------------------------
-total_patients = 0
-new_cluster_phenotypes = {}
-phenotypes_frequency = Hash.new(0)
-top_cluster_phenotypes.each_with_index do |cluster, clusterID|
-  total_patients = cluster.length
-  cluster.each do |phenotypes|
-    phenotypes.each do |p|
-      phenotypes_frequency[p] += 1
-    end
-  end
-  new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
-  phenotypes_frequency = Hash.new(0)
-end
+new_cluster_phenotypes = get_top_dummy_clusters_stats(top_clust_phen)
 container = {
   :temp_folder => temp_folder,
-  # :top_cluster_phenotypes => top_cluster_phenotypes.length,
+  # :top_clust_phen => top_clust_phen.length,
   :summary_stats => summary_stats,
   :clustering_methods => options[:clustering_methods],
   :hpo_stats => hpo_stats,
@@ -413,8 +325,8 @@ new_cluster_phenotypes.each do |clusterID, info|
     container["clust_#{clusterID}"] = clust_info
     clust_info = []
 end
 template = File.open(File.join(REPORT_FOLDER, 'cohort_report.erb')).read
 report = Report_html.new(container, 'Cohort quality report')
 report.build(template)
-report.write(options[:output_file]+'.html')
+report.write(options[:output_file]+'.html')

data/bin/comPatMondo.rb CHANGED Viewed

@@ -4,15 +4,12 @@
 # @author Fernando Moreno Jabato <jabato(at)uma(dot)es>
 ROOT_PATH = File.dirname(__FILE__)
-EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
-MONDO_FILE = File.join(EXTERNAL_DATA, 'mondo.obo')
-HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
-EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
 $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
 require 'optparse'
 require 'semtools'
 require 'csv'
+require 'constants.rb'
 require 'coPatReporterMethods.rb'
 ##########################