pets 0.2.3 → 0.2.4
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +63 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +38 -151
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +7 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +110 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +11 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +143 -441
- data/lib/pets/cohort.rb +307 -0
- data/lib/pets/constants.rb +7 -0
- data/lib/pets/generalMethods.rb +8 -317
- data/lib/pets/genomic_features.rb +144 -0
- data/lib/pets/io.rb +457 -0
- data/lib/pets/parsers/cohort_parser.rb +106 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +8 -0
- data/pets.gemspec +1 -0
- data/templates/cohort_report.erb +5 -7
- data/templates/patient_report.erb +1 -1
- metadata +34 -3
data/lib/pets/cohort.rb
ADDED
@@ -0,0 +1,307 @@
+require 'json'
+require 'semtools'
+
+class Cohort
+  @@ont = {}
+  class << self # https://www.ruby-forum.com/t/attr-accessor-for-class-variable/136693
+    attr_accessor :act_ont # Which ontology to use for ontology-related operations
+  end
+
+  attr_accessor :profiles
+
+  def self.get_ontology(ont_id)
+    return @@ont[ont_id]
+  end
+
+  def self.load_ontology(ont_name, ont_file, excluded_terms_file = nil)
+    ont = nil
+    if !ont_file.include?('.json')
+      if !excluded_terms_file.nil?
+        ont = Ontology.new(file: ont_file, load_file: true, removable_terms: read_excluded_ont_file(excluded_terms_file))
+      else
+        ont = Ontology.new(file: ont_file, load_file: true)
+      end
+    else
+      ont = Ontology.new
+      ont.read(ont_file)
+      if !excluded_terms_file.nil?
+        ont.add_removable_terms(read_excluded_ont_file(excluded_terms_file))
+        ont.remove_removable()
+        ont.build_index()
+      end
+    end
+    @@ont[ont_name] = ont
+  end
+
+  def self.read_excluded_ont_file(file)
+    excluded_hpo = []
+    File.open(file).each do |line|
+      excluded_hpo << line.chomp
+    end
+    return excluded_hpo
+  end
+
+  def initialize()
+    @profiles = {}
+    @vars = {}
+    @var_idx = Genomic_Feature.new([])
+  end
+
+  def add_record(rec) # [id, [profile], [[chr1, start1, stop1],[chr1, start1, stop1]]]
+    id, profile, vars = rec
+    @profiles[id] = profile.map{|t| t.to_sym} if !profile.nil?
+    add_gen_feat(id, vars) if !vars.nil?
+  end
+
+  def delete(id)
+    @profiles.delete(id)
+    @vars.delete(id)
+  end
+
+  def select_by_profile!
+    @profiles.select!{|id, profile| yield(id, profile)}
+    current_ids = @profiles.keys
+    @vars.select!{|id, var| current_ids.include?(id)}
+  end
+
+  def select_by_var!
+    @vars.select!{|id, var| yield(id, var)}
+    current_ids = @vars.keys
+    @profiles.select!{|id, profile| current_ids.include?(id)}
+  end
+
+  def filter_by_term_number(n_terms)
+    select_by_profile!{|id, profile| profile.length >= n_terms}
+  end
+
+  def remove_incomplete_records # remove records that lack vars or phenotypes
+    ids_with_terms = @profiles.keys
+    ids_with_vars = []
+    @vars.each{|id, regs| ids_with_vars << id if regs.length > 0}
+    full_ids = ids_with_vars & ids_with_terms
+    @profiles.select!{|id, prof| full_ids.include?(id)}
+    @vars.select!{|id, var| full_ids.include?(id)}
+  end
+
+  def add_gen_feat(id, feat_array) # [[chr1, start1, stop1],[chr1, start1, stop1]]
+    @vars[id] = Genomic_Feature.new(feat_array)
+  end
+
+  def get_profile(id)
+    return @profiles[id]
+  end
+
+  def get_vars(id)
+    return @vars[id]
+  end
+
+  def each_profile()
+    @profiles.each do |id, profile|
+      yield(id, profile)
+    end
+  end
+
+  def each_var()
+    @vars.each do |id, var_info|
+      yield(id, var_info)
+    end
+  end
+
+  def get_general_profile(thr=0) # TODO: move functionality to semtools
+    term_count = Hash.new(0)
+    each_profile do |id, prof|
+      prof.each do |term|
+        term_count[term] += 1
+      end
+    end
+    records = @profiles.length
+    general_profile = []
+    term_count.each do |term, count|
+      general_profile << term if count.fdiv(records) >= thr
+    end
+    ont = @@ont[Cohort.act_ont]
+    return ont.clean_profile_hard(general_profile)
+  end
+
+  def check(hard=false) # OLD format_patient_data
+    ont = @@ont[Cohort.act_ont]
+    rejected_terms = []
+    rejected_recs = []
+    @profiles.each do |id, terms|
+      if hard
+        terms = ont.clean_profile_hard(terms)
+        rejec_terms = []
+      else
+        terms, rejec_terms = ont.check_ids(terms)
+      end
+      if !rejec_terms.empty?
+        STDERR.puts "WARNING: record #{id} has unknown codes '#{rejec_terms.join(',')}'. Codes removed."
+        rejected_terms.concat(rejec_terms)
+      end
+      if terms.empty?
+        rejected_recs << id
+      else
+        @profiles[id] = terms
+      end
+    end
+    @profiles.select!{|id, record| !rejected_recs.include?(id)}
+    @vars.select!{|id, record| !rejected_recs.include?(id)}
+    return rejected_terms.uniq, rejected_recs
+  end
+
+  def link2ont(ont_id)
+    @@ont[ont_id].load_profiles(@profiles)
+  end
+
+  def get_profile_redundancy
+    ont = @@ont[Cohort.act_ont]
+    profile_sizes, parental_terms_per_profile = ont.get_profile_redundancy
+    return profile_sizes, parental_terms_per_profile
+  end
+
+  def get_profiles_terms_frequency(options={})
+    ont = @@ont[Cohort.act_ont]
+    term_stats = ont.get_profiles_terms_frequency(**options) # https://www.ruby-lang.org/en/news/2019/12/12/separation-of-positional-and-keyword-arguments-in-ruby-3-0/
+    return term_stats
+  end
+
+  def compute_term_list_and_childs()
+    ont = @@ont[Cohort.act_ont]
+    suggested_childs, term_with_childs_ratio = ont.compute_term_list_and_childs()
+  end
+
+  def get_profile_ontology_distribution_tables()
+    ont = @@ont[Cohort.act_ont]
+    ontology_levels, distribution_percentage = ont.get_profile_ontology_distribution_tables
+    ontology_levels.unshift(["level", "ontology", "cohort"])
+    distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
+    return ontology_levels, distribution_percentage
+  end
+
+  def get_ic_analysis()
+    ont = @@ont[Cohort.act_ont]
+    onto_ic, freq_ic = ont.get_observed_ics_by_onto_and_freq # IC for TERMS
+    onto_ic_profile, freq_ic_profile = ont.get_profiles_resnik_dual_ICs # IC for PROFILES
+    return onto_ic, freq_ic, onto_ic_profile, freq_ic_profile
+  end
+
+  def get_profiles_mean_size
+    ont = @@ont[Cohort.act_ont]
+    profile_mean_size = ont.get_profiles_mean_size
+    return profile_mean_size
+  end
+
+  def get_profile_length_at_percentile(perc=50, increasing_sort: false)
+    ont = @@ont[Cohort.act_ont]
+    length_percent = ont.get_profile_length_at_percentile(perc, increasing_sort: increasing_sort)
+    return length_percent
+  end
+
+  def get_dataset_specifity_index(type)
+    ont = @@ont[Cohort.act_ont]
+    dsi = ont.get_dataset_specifity_index(type)
+    return dsi
+  end
+
+  def compare_profiles(options={})
+    ont = @@ont[Cohort.act_ont]
+    similarities = ont.compare_profiles(**options)
+    return similarities
+  end
+
+  def index_vars # equivalent to process_patient_data
+    each_var do |id, var|
+      @var_idx.merge(var, id)
+    end
+  end
+
+  def get_vars_sizes(summary=false)
+    if summary
+      return @var_idx.get_summary_sizes
+    else
+      return @var_idx.get_sizes
+    end
+  end
+
+  def generate_cluster_regions(meth, tag, lim)
+    @var_idx.generate_cluster_regions(meth, tag, lim)
+  end
+
+  def save(output_file, mode = :default, translate = false)
+    File.open(output_file, 'w') do |f|
+      f.puts "id\tchr\tstart\tstop\tterms" if mode == :paco
+      ont = @@ont[Cohort.act_ont]
+      @profiles.each do |id, terms|
+        terms, rejected = ont.translate_ids(terms) if translate
+        id_variants = @vars[id]
+        variants = []
+        if id_variants.nil? || id_variants.length == 0
+          variants << ['-', '-', '-']
+        else
+          id_variants.each do |chr, reg|
+            variants << [chr, reg[:start], reg[:stop]]
+          end
+        end
+        variants.each do |var|
+          if mode == :default
+            f.puts "#{id}\t#{terms.join('|')}\t#{var.join("\t")}"
+          elsif mode == :paco
+            f.puts "#{id}\t#{var.join("\t")}\t#{terms.join('|')}"
+          else
+            abort('Wrong save mode option, please try default or paco')
+          end
+        end
+      end
+    end
+  end
+
+  def export_phenopackets(output_folder, genome_assembly, vcf_index: nil)
+    ont = @@ont[Cohort.act_ont]
+    metaData = {
+      "createdBy" => "PETS",
+      "resources" => [{
+        "id" => "hp",
+        "name" => "human phenotype ontology",
+        "namespacePrefix" => "HP",
+        "url" => "http://purl.obolibrary.org/obo/hp.owl",
+        # "version" => "2018-03-08",
+        "iriPrefix" => "http://purl.obolibrary.org/obo/HP_"
+      }]
+    }
+
+    @profiles.each do |id, terms|
+      phenopacket = {metaData: metaData}
+      phenopacket[:subject] = {id: id}
+      phenotypicFeatures = []
+      terms.each do |term|
+        term_name = ont.translate_id(term)
+        phenotypicFeatures << {
+          type: { id: term, label: term_name},
+          classOfOnset: {"id" => "HP:0003577", "label" => "Congenital onset"}
+        }
+      end
+      phenopacket[:phenotypicFeatures] = phenotypicFeatures
+      if !vcf_index.nil? && vcf_index.include?(id)
+        htsFiles = []
+        htsFiles << {
+          "uri" => "file:/" + vcf_index[id],
+          "description" => id,
+          "htsFormat" => "VCF",
+          "genomeAssembly" => genome_assembly,
+          "individualToSampleIdentifiers" => { "patient1" => id }
+        }
+        phenopacket[:htsFiles] = htsFiles
+      end
+      File.open(File.join(output_folder, id.to_s + ".json"), "w") { |f| f.write JSON.pretty_generate(phenopacket) }
+      id_variants = @vars[id]
+      variants = []
+      if id_variants.nil? || id_variants.length == 0
+        variants << ['-', '-', '-']
+      else
+        id_variants.each do |chr, reg|
+          variants << [chr, reg[:start], reg[:stop]]
+        end
+      end
+    end
+  end
+end
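The new Cohort class bundles what the bin/ scripts previously assembled by hand: a shared ontology registry (@@ont), one term profile per record, and one Genomic_Feature per record. A minimal usage sketch of the API above follows; the require, the hp.json path, and the HP codes are illustrative, not taken from this diff:

  require 'pets' # assumes the gem entry point loads Cohort and semtools

  Cohort.load_ontology(:hpo, 'external_data/hp.json') # illustrative path
  Cohort.act_ont = :hpo # ontology used by check, save, get_general_profile, etc.

  cohort = Cohort.new
  cohort.add_record(['pat_1', ['HP:0001250', 'HP:0004322'], [['chr1', 100, 200]]])
  cohort.check # drop unknown codes and now-empty records
  cohort.filter_by_term_number(2) # keep records with at least two terms
  cohort.index_vars # merge per-record variants into the shared Genomic_Feature index
  cohort.save('cohort.paco', :paco, true) # translate term ids, write PACO layout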
data/lib/pets/constants.rb
ADDED
@@ -0,0 +1,7 @@
+# The file requiring this one must define the ROOT_PATH constant
+REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
+EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
+EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
+HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
+MONDO_FILE = File.join(EXTERNAL_DATA, 'mondo.obo')
+IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
data/lib/pets/generalMethods.rb
CHANGED
@@ -13,80 +13,15 @@ def system_call(code_folder, script, args_string)
   end
 end
 
-def
-
-
-
-
-
-
-
-#Common methods for predictors
-#Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
-#1. Indexing by chr (region)
-def coor_overlap?(ref_start, ref_stop, start, stop)
-  overlap = false
-  if (stop > ref_start && stop <= ref_stop) ||
-    (start >= ref_start && start < ref_stop) ||
-    (start <= ref_start && stop >= ref_stop) ||
-    (start > ref_start && stop < ref_stop)
-    overlap = true
-  end
-  return overlap
-end
-
-def load_training_file4regions(training_file)
-  training_set = {}
-  posInfo = loadFile(training_file)
-  posInfo.each do |info|
-    chr = info.shift
-    query = training_set[chr]
-    if query.nil?
-      training_set[chr] = [info]
-    else
-      query << info
-    end
-  end
-  return training_set
-end
-
-#2. Indexing by hpo (code)
-#prepare training file for analysis using phenotype2region prediction
-def load_training_file4HPO(training_file, thresold=0)
-  training_set = {}
-  information = loadFile(training_file, thresold)
-  information.each do |info|
-    hpoCode = info.delete_at(4)
-    query = training_set[hpoCode]
-    if query.nil?
-      training_set[hpoCode] = [info]
-    else
-      query << info
-    end
-  end
-  # STDERR.puts training_set.keys.inspect
-  return training_set
-end
-
-
-#3. Load training info file:
-#Chr;Start;Stop;HPO;Association;node
-def loadFile(file, thresold=0)
-  information = []
-  File.open(file).each do |line|
-    line.chomp!
-    allInfo = line.split("\t")
-    associationValue = allInfo[4].to_f
-    if associationValue >= thresold
-      chr = allInfo[0]
-      startPos = allInfo[1].to_i
-      stopPos = allInfo[2].to_i
-      hpoCode = allInfo[3]
-      nodeID = allInfo[5]
-      information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
-    end
+def add_record(hash, key, record, uniq=false)
+  query = hash[key]
+  if query.nil?
+    hash[key] = [record]
+  elsif !uniq # duplicate entries are allowed
+    query << record
+  elsif !query.include?(record) # only unique entries are kept
+    query << record
   end
-  return information
 end
 
 
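The rewritten add_record generalizes the removed hash-of-arrays helpers (and replaces the old add_record removed further down this diff): by default it appends every record, while uniq=true reproduces the old skip-duplicates behavior. A small illustrative example, with made-up keys and values:

  index = {}
  add_record(index, 'chr1', 'pat_1') # creates the bucket
  add_record(index, 'chr1', 'pat_1') # appended again: duplicates allowed by default
  add_record(index, 'chr1', 'pat_1', true) # skipped: record already present
  index # => {"chr1"=>["pat_1", "pat_1"]}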
@@ -111,105 +46,6 @@ def compute_IC_values(patient_data, total_patients)
   return patients_per_hpo
 end
 
-def load_hpo_ci_values(information_coefficient_file)
-  hpos_ci_values = {}
-  File.open(information_coefficient_file).each do |line|
-    line.chomp!
-    hpo_code, ci = line.split("\t")
-    hpos_ci_values[hpo_code.to_sym] = ci.to_f
-  end
-  return hpos_ci_values
-end
-
-def load_clustered_patients(file)
-  clusters = {}
-  File.open(file).each do |line|
-    line.chomp!
-    pat_id, cluster_id = line.split("\t")
-    query = clusters[cluster_id]
-    if query.nil?
-      clusters[cluster_id] = [pat_id]
-    else
-      query << pat_id
-    end
-  end
-  return clusters
-end
-
-def load_gene_data(gene_data_path)
-  gene_list = {} #geneID => attr
-  gene_location = {} # chr => gene
-  infile = open(gene_data_path)
-  gz = Zlib::GzipReader.new(infile)
-  current_chr = nil
-  genes = []
-  gz.each_line do |line|
-    line.chomp!
-    next if line =~ /^#/
-    fields = line.split("\t")
-    if fields[8].include?('genome=chromosome')
-      chr = fields[8].split(';')[1].split('=').last
-      gene_location[current_chr] = genes
-      genes = []
-      current_chr = chr
-    elsif fields[2] == 'gene'
-      attributes = {}
-      fields[8].split(';').each do |pair|
-        key, value = pair.split('=')
-        attributes[key] = value
-      end
-      geneName = nil
-      geneName = attributes['gene'] if !attributes['gene'].nil?
-      geneSyns = []
-      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
-      description = attributes['description']
-      description = URI.unescape(description) if !description.nil?
-      attributes['Dbxref'] =~ /GeneID:(\d+)/
-      gene_list[$1] = [geneName, geneSyns, description]
-      genes << [$1, fields[3].to_i, fields[4].to_i]
-    end
-  end
-  gene_location[current_chr] = genes
-  return gene_list, gene_location
-end
-
-def parse_kegg_data(query_genes)
-  kegg_data = {} #gene => attb
-  while !query_genes.empty?
-    gene_set = query_genes.shift(10)
-    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
-    uri = URI(url)
-    response = Net::HTTP.get(uri)
-    geneID = nil
-    gene_names = []
-    definition = nil
-    pathways = []
-    parsing_pathway_field = false
-    response.squeeze(' ').each_line do |line|
-      line.chomp!
-      if line =~ /^ENTRY/
-        geneID = line.split(' ')[1]
-      elsif line =~ /^NAME/
-        gene_names = line.split(' ', 2).last.split(', ')
-      elsif line =~ /^DEFINITION/
-        definition = line.split(' ', 2)[1]
-      elsif line =~ /^PATHWAY/
-        pathways << line.split(' ', 3)[1..2]
-        parsing_pathway_field = true
-      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
-        parsing_pathway_field = false
-      elsif parsing_pathway_field
-        pathways << line.strip.split(' ', 2)
-      elsif line == '///'
-        parsing_pathway_field = false
-        kegg_data[geneID] = [gene_names, definition, pathways]
-        pathways = []
-        gene_names = []
-      end
-    end
-  end
-  return kegg_data
-end
 
 def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
   kegg_data = {}
@@ -270,21 +106,6 @@ def merge_genes_with_kegg_data(gene_list, kegg_data)
   return merged_data
 end
 
-def write_compressed_plain_file(data, path)
-  File.open(path, 'w') do |f|
-    gz = Zlib::GzipWriter.new(f)
-    gz.write data.to_json
-    gz.close
-  end
-end
-
-def read_compressed_json(path)
-  infile = open(path)
-  gz = Zlib::GzipReader.new(infile)
-  object = JSON.parse(gz.read)
-  return object
-end
-
 def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
   pathways_genes_in_predictions = {}
   genes_in_predictions = []
@@ -358,138 +179,8 @@ def binom(n,k)
   end
 end
 
-def get_reference(genomic_ranges)
-  #genomic_ranges = [patientID, mut_start, mut_stop]
-  reference = []
-  reference.concat(genomic_ranges.map{|gr| gr[1]})# get start
-  reference.concat(genomic_ranges.map{|gr| gr[2]})# get stop
-  reference.uniq!
-  reference.sort!
-  #Define overlap range
-  final_reference = []
-  reference.each_with_index do |coord,i|
-    next_coord = reference[i + 1]
-    final_reference << [coord, next_coord] if !next_coord.nil?
-  end
-  return final_reference
-end
-
-def overlap_patients(genomic_ranges, reference)
-  overlaps = []
-  reference.each do |start, stop|
-    patients = []
-    genomic_ranges.each do |pt_id, pt_start, pt_stop|
-      if (start <= pt_start && stop >= pt_stop) ||
-        (start > pt_start && stop < pt_stop) ||
-        (stop > pt_start && stop <= pt_stop) ||
-        (start >= pt_start && start < pt_stop)
-        patients << pt_id
-      end
-    end
-    overlaps << patients.uniq
-  end
-  return overlaps
-end
-
-def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1)
-  patients_out_of_cluster = 0
-  patients_by_cluster = {}
-  sors = []
-  patients_genomic_region_by_chr.each do |chrm, genomic_ranges|
-    reference = get_reference(genomic_ranges) # Get putative overlap regions
-    overlapping_patients = overlap_patients(genomic_ranges, reference) # See what patient has match with a overlap region
-    clust_number = 1
-    reference.each_with_index do |ref, i|
-      current_patients = overlapping_patients[i]
-      if current_patients.length > pat_per_reg
-        ref << chrm
-        node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}"
-        ref << node_identifier
-        save_sor(current_patients, node_identifier, patients_by_cluster)
-        sors << ref
-        clust_number += 1
-      end
-    end
-  end
-  return patients_by_cluster, sors
-end
 
-def save_sor(current_patients, node_identifier, patients_by_cluster)
-  current_patients.each do |patient|
-    add_record(patients_by_cluster, patient, node_identifier)
-  end
-end
 
-def add_record(hash, key, record)
-  query = hash[key]
-  if query.nil?
-    hash[key] = [record]
-  elsif !query.include?(record)
-    query << record
-  end
-end
-
-def load_patient_cohort(options)
-  patient_data = {}
-  count = 0
-  fields2extract = get_fields2extract(options)
-  field_numbers = fields2extract.values
-  File.open(options[:input_file]).each do |line|
-    line.chomp!
-    if options[:header] && count == 0
-      line.gsub!(/#\s*/,'') # correct comment like headers
-      field_names = line.split("\t")
-      get_field_numbers2extract(field_names, fields2extract)
-      field_numbers = fields2extract.values
-    else
-      fields = line.split("\t")
-      pat_record = field_numbers.map{|n| fields[n]}
-      if fields2extract[:pat_id_col].nil?
-        pat_id = "pat_#{count}" #generate ids
-      else
-        original_id = pat_record.shift
-        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
-      end
-      if !pat_record[0].nil?
-        pat_record[0] = pat_record[0].split(options[:hpo_separator])
-      else
-        pat_record[0] = []
-      end
-      pat_record[2] = pat_record[2].to_i if !options[:start_col].nil?
-      pat_record[3] = pat_record[3].to_i if !options[:end_col].nil?
-      patient_data[pat_id] = pat_record
-    end
-    count +=1
-  end
-  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
-  return patient_data
-end
-
-def get_fields2extract(options)
-  fields2extract = {}
-  [:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
-    col = options[field]
-    if !col.nil?
-      col = col.to_i if !options[:header]
-      fields2extract[field] = col
-    end
-  end
-  return fields2extract
-end
-
-def get_field_numbers2extract(field_names, fields2extract)
-  fields2extract.each do |field, name|
-    fields2extract[field] = field_names.index(name)
-  end
-end
-
-def download(ftp_server, path, name)
-  ftp = Net::FTP.new()
-  ftp.connect(ftp_server)
-  ftp.login
-  ftp.getbinaryfile(path, name)
-  ftp.close
-end
 
 def get_and_parse_external_data(all_paths)
   sources = [