RubyGems - pets - Versions diffs - 0.2.3 → 0.2.5 - Mend

pets 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

checksums.yaml +4 -4
data/Gemfile +2 -0
data/README.md +79 -5
data/bin/coPatReporter.rb +68 -156
data/bin/comPatMondo.rb +1 -4
data/bin/evidence_profiler.rb +102 -150
data/bin/get_gen_features.rb +146 -0
data/bin/get_network_nodes.rb +79 -132
data/bin/get_sorted_profs.rb +25 -36
data/bin/install_deps.rb +8 -0
data/bin/paco_translator.rb +29 -72
data/bin/phen2reg.rb +1 -4
data/bin/profiles2phenopacket.rb +86 -0
data/bin/reg2phen.rb +1 -3
data/example_datasets/associations_file.txt +757 -0
data/example_datasets/example_patient.txt +6 -0
data/example_datasets/example_patient_hpos.txt +15 -0
data/example_datasets/genes.txt +8 -0
data/example_datasets/hpo2ci.txt +2798 -0
data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
data/example_datasets/launch.sh +20 -0
data/external_code/generate_boxpot.R +51 -21
data/external_code/get_clusters.R +2 -2
data/external_code/install_R_dependencies.R +16 -0
data/external_code/plot_heatmap.R +34 -30
data/lib/pets/coPatReporterMethods.rb +172 -424
data/lib/pets/cohort.rb +309 -0
data/lib/pets/common_optparse.rb +30 -0
data/lib/pets/constants.rb +8 -0
data/lib/pets/generalMethods.rb +29 -319
data/lib/pets/genomic_features.rb +240 -0
data/lib/pets/io.rb +481 -0
data/lib/pets/parsers/cohort_parser.rb +111 -0
data/lib/pets/parsers/reference_parser.rb +39 -0
data/lib/pets/version.rb +1 -1
data/lib/pets.rb +9 -0
data/pets.gemspec +7 -3
data/templates/cluster_report.erb +25 -5
data/templates/cohort_report.erb +5 -7
data/templates/evidence_profile.erb +20 -4
data/templates/patient_report.erb +1 -1
metadata +96 -5

data/lib/pets/generalMethods.rb CHANGED Viewed

@@ -13,80 +13,15 @@ def system_call(code_folder, script, args_string)
 	end
 end
-def read_excluded_hpo_file(file)
-	excluded_hpo = []
-	File.open(file).each do |line|
-		excluded_hpo << line.chomp
-	end
-	return excluded_hpo
-end
-#Common methods for predictors
-#Training file example = 9  131371492   131375954   HP:0010974  2.41161970596	9.3.A.5
-#1. Indexing by chr (region)
-def coor_overlap?(ref_start, ref_stop, start, stop)
-  overlap = false
-  if (stop > ref_start && stop <= ref_stop) ||
-    (start >= ref_start && start < ref_stop) ||
-    (start <= ref_start && stop >= ref_stop) ||
-    (start > ref_start && stop < ref_stop)
-    overlap = true
-  end
-  return overlap
-end
-def load_training_file4regions(training_file)
-	training_set = {}
-	posInfo = loadFile(training_file)
-	posInfo.each do |info|
-		chr = info.shift
-		query = training_set[chr]
-		if query.nil?
-			training_set[chr] = [info]
-		else
-			query << info
-		end
-	end
-	return training_set
-end
-#2. Indexing by hpo (code)
-#prepare training file for analysis using phenotype2region prediction
-def load_training_file4HPO(training_file, thresold=0)
-	training_set = {}
-	information = loadFile(training_file, thresold)
-	information.each do |info|
-		hpoCode = info.delete_at(4)
-		query = training_set[hpoCode]
-		if query.nil?
-			training_set[hpoCode] = [info]
-		else
-			query << info
-		end
-	end
-	# STDERR.puts training_set.keys.inspect
-	return training_set
-end
-#3. Load training info file:
-#Chr;Start;Stop;HPO;Association;node
-def loadFile(file, thresold=0)
-	information = []
-	File.open(file).each do |line|
-		line.chomp!
-		allInfo = line.split("\t")
-		associationValue = allInfo[4].to_f
-		if associationValue >= thresold
-			chr = allInfo[0]
-			startPos = allInfo[1].to_i
-			stopPos = allInfo[2].to_i
-			hpoCode = allInfo[3]
-			nodeID = allInfo[5]
-			information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
-		end
+def add_record(hash, key, record, uniq=false)
+	query = hash[key]
+	if query.nil?
+		hash[key] = [record]
+	elsif !uniq # We not take care by repeated entries
+		query << record
+	elsif !query.include?(record) # We want uniq entries
+		query << record
 	end
-	return information
 end
@@ -111,105 +46,6 @@ def compute_IC_values(patient_data, total_patients)
 	return patients_per_hpo
 end
-def load_hpo_ci_values(information_coefficient_file)
-	hpos_ci_values = {}
-	File.open(information_coefficient_file).each do |line|
-		line.chomp!
-		hpo_code, ci = line.split("\t")
-		hpos_ci_values[hpo_code.to_sym] = ci.to_f
-	end
-	return hpos_ci_values
-end
-def load_clustered_patients(file)
-	clusters = {}
-	File.open(file).each do |line|
-		line.chomp!
-		pat_id, cluster_id = line.split("\t")
-		query = clusters[cluster_id]
-		if query.nil?
-			clusters[cluster_id] = [pat_id]
-		else
-			query << pat_id
-		end
-	end
-	return clusters
-end
-def load_gene_data(gene_data_path)
-	gene_list = {} #geneID => attr
-	gene_location = {} # chr => gene
-	infile = open(gene_data_path)
-	gz = Zlib::GzipReader.new(infile)
-	current_chr = nil
-	genes = []
-	gz.each_line do |line|
-		line.chomp!
-		next if line =~ /^#/
-		fields = line.split("\t")
-		if fields[8].include?('genome=chromosome')
-			chr = fields[8].split(';')[1].split('=').last
-			gene_location[current_chr] = genes
-			genes = []
-			current_chr = chr
-		elsif fields[2] == 'gene'
-			attributes = {}
-			fields[8].split(';').each do |pair|
-				key, value = pair.split('=')
-				attributes[key] = value
-			end
-			geneName = nil
-			geneName = attributes['gene'] if !attributes['gene'].nil?
-			geneSyns = []
-			geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
-			description = attributes['description']
-			description = URI.unescape(description) if !description.nil?
-			attributes['Dbxref'] =~ /GeneID:(\d+)/
-			gene_list[$1] = [geneName, geneSyns, description]
-			genes << [$1, fields[3].to_i, fields[4].to_i]
-		end
-	end
-	gene_location[current_chr] = genes
-	return gene_list, gene_location
-end
-def parse_kegg_data(query_genes)
-	kegg_data = {} #gene => attb
-    while !query_genes.empty?
-    	gene_set = query_genes.shift(10)
-	    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
-	    uri = URI(url)
-	    response = Net::HTTP.get(uri)
-		geneID = nil
-		gene_names = []
-		definition = nil
-		pathways = []
-		parsing_pathway_field = false
-		response.squeeze(' ').each_line do |line|
-			line.chomp!
-			if line =~ /^ENTRY/
-				geneID = line.split(' ')[1]
-			elsif line =~ /^NAME/
-				gene_names = line.split(' ', 2).last.split(', ')
-			elsif line =~ /^DEFINITION/
-				definition = line.split(' ', 2)[1]
-			elsif line =~ /^PATHWAY/
-				pathways << line.split(' ', 3)[1..2]
-				parsing_pathway_field = true
-			elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
-				parsing_pathway_field = false
-			elsif parsing_pathway_field
-				pathways << line.strip.split(' ', 2)
-			elsif line == '///'
-				parsing_pathway_field = false
-				kegg_data[geneID] = [gene_names, definition, pathways]
-				pathways = []
-				gene_names = []
-			end
-		end
-	end
-	return kegg_data
-end
 def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
 	kegg_data = {}
@@ -270,21 +106,6 @@ def merge_genes_with_kegg_data(gene_list, kegg_data)
 	return merged_data
 end
-def write_compressed_plain_file(data, path)
-	File.open(path, 'w') do |f|
-		gz = Zlib::GzipWriter.new(f)
-		gz.write data.to_json
-		gz.close
-	end
-end
-def read_compressed_json(path)
-	infile = open(path)
-	gz = Zlib::GzipReader.new(infile)
-	object = JSON.parse(gz.read)
-	return object
-end
 def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
 	pathways_genes_in_predictions = {}
 	genes_in_predictions = []
@@ -358,138 +179,8 @@ def binom(n,k)
 	end
 end
-def get_reference(genomic_ranges)
-	#genomic_ranges = [patientID, mut_start, mut_stop]
-	reference = []
-	reference.concat(genomic_ranges.map{|gr| gr[1]})# get start
-	reference.concat(genomic_ranges.map{|gr| gr[2]})# get stop
-	reference.uniq!
-	reference.sort!
-	#Define overlap range
-	final_reference = []
-	reference.each_with_index do |coord,i|
-		next_coord = reference[i + 1]
-		final_reference << [coord, next_coord] if !next_coord.nil?
-	end
-	return final_reference
-end
-def overlap_patients(genomic_ranges, reference)
-	overlaps = []
-	reference.each do |start, stop|
-		patients = []
-		genomic_ranges.each do |pt_id, pt_start, pt_stop|
-			if (start <= pt_start && stop >= pt_stop) ||
-				(start > pt_start && stop < pt_stop) ||
-				(stop > pt_start && stop <= pt_stop) ||
-				(start >= pt_start && start < pt_stop)
-				patients << pt_id
-			end
-		end
-		overlaps << patients.uniq
-	end
-	return overlaps
-end
-def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1)
-	patients_out_of_cluster = 0
-	patients_by_cluster = {}
-	sors = []
-	patients_genomic_region_by_chr.each do |chrm, genomic_ranges|
-		reference = get_reference(genomic_ranges) # Get putative overlap regions
-		overlapping_patients = overlap_patients(genomic_ranges, reference) # See what patient has match with a overlap region
-		clust_number = 1
-		reference.each_with_index do |ref, i|
-			current_patients = overlapping_patients[i]
-			if current_patients.length > pat_per_reg
-				ref << chrm
-				node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}"
-				ref << node_identifier
-				save_sor(current_patients, node_identifier, patients_by_cluster)
-				sors << ref
-				clust_number += 1
-			end
-		end
-	end
-	return patients_by_cluster, sors
-end
-def save_sor(current_patients, node_identifier, patients_by_cluster)
-	current_patients.each do |patient|
-		add_record(patients_by_cluster, patient, node_identifier)
-	end
-end
-def add_record(hash, key, record)
-	query = hash[key]
-	if query.nil?
-		hash[key] = [record]
-	elsif !query.include?(record)
-		query << record
-	end
-end
-def load_patient_cohort(options)
-	patient_data = {}
-	count = 0
-	fields2extract = get_fields2extract(options)
-	field_numbers = fields2extract.values
-	  File.open(options[:input_file]).each do |line|
-	    line.chomp!
-	    if options[:header] && count == 0
-	      line.gsub!(/#\s*/,'') # correct comment like  headers
-	      field_names = line.split("\t")
-	      get_field_numbers2extract(field_names, fields2extract)
-	      field_numbers = fields2extract.values
-	    else
-	      fields = line.split("\t")
-	      pat_record = field_numbers.map{|n| fields[n]}
-	      if fields2extract[:pat_id_col].nil?
-	        pat_id = "pat_#{count}" #generate ids
-	      else
-	        original_id = pat_record.shift
-	        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
-	      end
-	      if !pat_record[0].nil?
-	      	pat_record[0] = pat_record[0].split(options[:hpo_separator])
-	      else
-	      	pat_record[0] = []
-	      end
-	      pat_record[2] = pat_record[2].to_i if !options[:start_col].nil?
-	      pat_record[3] = pat_record[3].to_i if !options[:end_col].nil?
-	      patient_data[pat_id] = pat_record
-	    end
-	    count +=1
-	  end
-	  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
-	  return patient_data
-end
-def get_fields2extract(options)
-	fields2extract = {}
-	[:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
-		col = options[field]
-		if !col.nil?
-			col = col.to_i if !options[:header]
-			fields2extract[field] = col
-		end
-	end
-	return fields2extract
-end
-def get_field_numbers2extract(field_names, fields2extract)
-  fields2extract.each do |field, name|
-    fields2extract[field] = field_names.index(name)
-  end
-end
-def download(ftp_server, path, name)
-  ftp = Net::FTP.new()
-  ftp.connect(ftp_server)
-  ftp.login
-  ftp.getbinaryfile(path, name)
-  ftp.close
-end
 def get_and_parse_external_data(all_paths)
 	sources = [
@@ -552,8 +243,27 @@ def get_detailed_similarity(profile, candidates, evidences, hpo)
 	return matrix
 end
-def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit)
-		candidates = similarities.to_a.sort{|s1, s2| s2.last <=> s1.last}.first(candidate_limit)
+def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {})
+		candidates = similarities.to_a
+		if other_scores.empty?
+			candidates.sort!{|s1, s2| s2.last <=> s1.last}
+			candidates = candidates.first(candidate_limit)
+		else # Prioritize first by the external list of scores, select the candidates and then rioritize by similarities
+			selected_candidates = []
+			candidates.each do |cand|
+				cand_id = cand[0]
+				cand_lab = id2label[cand_id.to_s]
+				next if cand_lab.nil?
+				other_score = other_scores[cand_lab]
+				next if other_score.nil?
+				cand << other_score
+				selected_candidates << cand
+			end
+			selected_candidates.sort!{|e1, e2| e2[2] <=> e1[2]}
+			candidates = selected_candidates.first(candidate_limit)
+			candidates.sort!{|e1, e2| e2[1] <=> e1[1]}
+			candidates.each{|c| c.pop}
+		end
 		candidates_ids = candidates.map{|c| c.first}
 		candidate_similarity_matrix = get_detailed_similarity(reference_prof, candidates, evidence_profiles, hpo)
 		candidate_similarity_matrix.each_with_index do |row, i|

data/lib/pets/genomic_features.rb ADDED Viewed

@@ -0,0 +1,240 @@
# Container of genomic regions grouped by chromosome.
#
# Every region is stored as a Hash {chr:, start:, stop:, to:} (plus :attrs
# when annotations are loaded). :to is the identifier the region is linked
# "to": either a caller supplied id or an auto-incremented integer.
class Genomic_Feature
	# Reference feature set shared by every instance (see .add_reference and #get_features).
	@@ref = nil

	# Builds an instance from +arr+; the block must turn each element into
	# [chr, start, stop] or [chr, start, stop, to].
	def self.array2genomic_feature(arr)
		new(arr.map{|r| yield(r)})
	end

	# Builds an instance from the hash +h+; the block receives (key, value) and
	# must return [chr, start, stop] or [chr, start, stop, to].
	def self.hash2genomic_feature(h)
		new(h.map{|key, value| yield(key, value)})
	end

	# Sets the class-wide reference (a Genomic_Feature) used by #get_features.
	def self.add_reference(genomic_regions)
		@@ref = genomic_regions
	end

	#If any method use gen_fet as name is a Genomic_Feature object
	# feat_array:  [[chr1, start1, stop1], [chr1, start2, stop2, to], ...]
	# annotations: optional Hash indexed by region id (:to) whose values are stored in region[:attrs]
	def initialize(feat_array, annotations: nil)
		@regions = {}    # chr (Symbol) => [region, ...]
		@reg_by_to = {}  # region id => region (the last one loaded wins when ids repeat)
		@reg_id = -1     # last auto-generated id
		load_features(feat_array)
		load_annotations(annotations) if !annotations.nil?
	end

	# Loads regions given as [chr, start, stop, to]; +to+ is optional and
	# defaults to the next value of the internal counter.
	def load_features(feat_array)
		feat_array.each do |chr, start, stop, to|
			chr = chr.to_sym
			@reg_id += 1 # The counter advances for every region, even when a custom id is given
			id = to.nil? ? @reg_id : to
			region = {chr: chr, start: start, stop: stop, to: id }
			@reg_by_to[id] = region
			add_record(@regions, chr, region)
		end
	end

	# Attaches annotations[region[:to]] as region[:attrs] to every region that has an entry.
	def load_annotations(annotations)
		each do |_chr, reg|
			annot = annotations[reg[:to]]
			reg[:attrs] = annot if !annot.nil?
		end
	end

	# Number of chromosomes holding regions (NOT the number of regions).
	def length
		return @regions.length
	end

	# Yields (chr, regions_array) once per chromosome.
	def each_chr()
		@regions.each do |chr, regs|
			yield(chr, regs)
		end
	end

	# Yields (chr, region) once per region.
	def each()
		@regions.each do |chr, regs|
			regs.each do |region|
				yield(chr, region)
			end
		end
	end

	# Chromosome names (Symbols) in insertion order.
	def get_chr
		return @regions.keys
	end

	# Regions stored for +chr+ or nil when the chromosome is unknown.
	def get_chr_regs(chr)
		return @regions[chr]
	end

	# Region registered with the id +to+ or nil.
	def region_by_to(to)
		return @reg_by_to[to]
	end

	# Array with the size (stop - start + 1) of every region.
	def get_sizes
		sizes = []
		each do |_chr, region|
			sizes << region_size(region)
		end
		return sizes
	end

	# Matches the regions against the class reference.
	# Returns {region_id => [reference_region_id, ...]}. When +attr_type+ is
	# given, reference ids are replaced by that attribute of the reference
	# regions (nil values dropped, duplicates removed).
	# A reference must have been set before with .add_reference.
	def get_features(attr_type: nil)
		features = match(@@ref)
		return features if attr_type.nil?
		# Build a new hash instead of rewriting the one being iterated
		return features.each_with_object({}) do |(reg_id, feat_ids), translated|
			attrs = feat_ids.map{|fi| @@ref.region_by_to(fi).dig(:attrs, attr_type)}
			translated[reg_id] = attrs.compact.uniq
		end
	end

	# Returns {region_id => [other_region_id, ...]} for each region that
	# overlaps at least one region of +other_gen_feat+ on the same chromosome.
	def match(other_gen_feat)
		all_matches = {}
		each_chr do |chr, regs|
			other_regs = other_gen_feat.get_chr_regs(chr)
			next if other_regs.nil?
			regs.each do |reg|
				local_matches = []
				start = reg[:start]
				stop = reg[:stop]
				other_regs.each do |other_reg|
					local_matches << other_reg[:to] if coor_overlap?(start, stop, other_reg)
				end
				all_matches[reg[:to]] = local_matches if !local_matches.empty?
			end
		end
		return all_matches
	end

	# Returns [[size, number_of_regions], ...] sorted by ascending number of
	# regions (ties broken by ascending size so the output is deterministic).
	# The previous comparator took a single block argument and compared each
	# pair with itself, so it never sorted anything.
	def get_summary_sizes
		sizes = Hash.new(0)
		each do |_chr, region|
			sizes[region_size(region)] += 1
		end
		return sizes.sort_by{|size, count| [count, size]}
	end

	# Adds the regions of +gen_fet+ to this object.
	# 'to' the regions must be connected "to" given id; when nil each region
	# gets the next auto-generated id.
	# NOTE: the region hashes are shared with +gen_fet+ (not copied), so their
	# :to value is overwritten in the source object too.
	def merge(gen_fet, to = nil)
		gen_fet.each do |chr, region|
			id = to.nil? ? (@reg_id += 1) : to # handle id custom or default
			region[:to] = id
			@reg_by_to[id] = region # Keep the id index in sync, as load_features does
			add_record(@regions, chr, region)
		end
	end

	# For each [start, stop] window in +reference+ returns the uniq ids (:to)
	# of the regions in +genomic_ranges+ overlapping it. Output follows the
	# order of +reference+.
	def get_reference_overlaps(genomic_ranges, reference)
		overlaps = []
		reference.each do |start, stop|
			hits = genomic_ranges.select{|reg| coor_overlap?(start, stop, reg)}
			overlaps << hits.map{|reg| reg[:to]}.uniq
		end
		return overlaps
	end

	# Splits each chromosome in windows (see #compute_windows) and keeps the
	# windows overlapped by more than +ids_per_reg+ different region ids.
	# Cluster ids look like "chr.cluster_number.tag.number_of_ids".
	# Returns:
	#   ids_by_cluster     => {region_id => [cluster_id, ...]}
	#   annotated_full_ref => [[start, stop, chr, cluster_id], ...] or, when
	#                         +obj+ is true, a Genomic_Feature with that data
	# Raises ArgumentError when +meth+ is not a supported window method.
	def generate_cluster_regions(meth, tag, ids_per_reg = 1, obj = false)
		compute_windows(meth) # Get putative genome windows
		ids_by_cluster = {}
		annotated_full_ref = [] # All reference windows with uniq id and chr tagged
		@regions.each do |chr, regs|
			reference = @windows[chr]
			raise ArgumentError, "Unsupported window method: #{meth.inspect}" if reference.nil?
			overlaps = get_reference_overlaps(regs, reference)
			clust_numb = 0
			reference.each_with_index do |ref, i|
				current_ids = overlaps[i]
				next unless current_ids.length > ids_per_reg
				clust_numb += 1
				clust_id = "#{chr}.#{clust_numb}.#{tag}.#{current_ids.length}"
				current_ids.each do |curr_id|
					add_record(ids_by_cluster, curr_id, clust_id, true)
				end
				annotated_full_ref << ref.dup.concat([chr, clust_id])
			end
		end
		annotated_full_ref = Genomic_Feature.array2genomic_feature(annotated_full_ref){|r| [r[2], r[0], r[1], r[3]]} if obj
		return ids_by_cluster, annotated_full_ref
	end

	# Fills @windows (chr => [[start, stop], ...]) using the method +meth+.
	# Only :reg_overlap is implemented; any other value stores nil for every chromosome.
	def compute_windows(meth)
		@windows = {}
		@regions.each do |chr, regs|
			chr_windows = nil
			if meth == :reg_overlap
				chr_windows = compute_region_overlap_windows(regs)
			end
			@windows[chr] = chr_windows
		end
	end

	private

	# Appends +record+ to the array stored in hash[key], creating it when
	# needed. With +uniq+ true, records already present are not added again.
	def add_record(hash, key, record, uniq=false)
		query = hash[key]
		if query.nil?
			hash[key] = [record]
		elsif !uniq # We not take care by repeated entries
			query << record
		elsif !query.include?(record) # We want uniq entries
			query << record
		end
	end

	# Size in nucleotides of a region, both ends included.
	def region_size(region)
		return region[:stop] - region[:start] + 1
	end

	# Builds consecutive [start, stop] windows from every region boundary.
	# Regions with start == stop (e.g. SNVs) get their own one-position window
	# and the neighbouring windows are trimmed so they do not include it.
	def compute_region_overlap_windows(genomic_ranges)
		reference = []
		single_nt = []
		genomic_ranges.each do |gr|
			start = gr[:start]
			stop = gr[:stop]
			if stop - start > 0
				reference << start # get start
				reference << stop # get stop
			else # Build a window of at least one nt for snv
				single_nt << start
			end
		end
		reference.uniq!
		single_nt.each do |snt| # add start stop for snv
			reference << snt
			reference << snt
		end
		reference.sort!
		#Define overlap ranges
		final_reference = []
		last_len = 1
		reference.each_with_index do |coord,i|
			next_coord = reference[i + 1]
			if !next_coord.nil?
				current_len = next_coord - coord
				coord = coord + 1 if last_len == 0 # Separate SNV window from others
				if current_len == 0 && last_len > 0 && !final_reference.empty?
					final_reference.last[1] -= 1 # Separate SNV window from others
				end
				final_reference << [coord, next_coord]
				last_len = current_len
			end
		end
		return final_reference
	end

	# True when the interval start..stop overlaps the region +reg+.
	# Intervals that only share one boundary coordinate (one ends where the
	# other begins) are not reported, unless one of them is a single position.
	def coor_overlap?(start, stop, reg)
		reg_start = reg[:start]
		reg_stop = reg[:stop]
		return (start <= reg_start && stop >= reg_stop) ||
			(start > reg_start && stop < reg_stop) ||
			(stop > reg_start && stop <= reg_stop) ||
			(start >= reg_start && start < reg_stop)
	end
end