RubyGems - pets - Versions diffs - 0.1.2 - Mend

pets 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

checksums.yaml +7 -0
data/.gitignore +11 -0
data/.rspec +3 -0
data/.travis.yml +7 -0
data/Gemfile +4 -0
data/LICENSE.txt +21 -0
data/README.md +41 -0
data/Rakefile +6 -0
data/bin/area_under_curve_pr.rb +118 -0
data/bin/association_metrics_average.rb +94 -0
data/bin/coPatReporter.rb +531 -0
data/bin/console +14 -0
data/bin/fmeasure_index.rb +72 -0
data/bin/get_PR_values.rb +90 -0
data/bin/get_clusters.R +18 -0
data/bin/get_network_nodes.rb +197 -0
data/bin/lines.R +77 -0
data/bin/merge_by_cluster.rb +62 -0
data/bin/merge_pairs.rb +138 -0
data/bin/paco_translator.rb +102 -0
data/bin/phen2reg.rb +385 -0
data/bin/phen2reg_predictor_check.rb +297 -0
data/bin/plot_area.R +71 -0
data/bin/plot_boxplot.R +21 -0
data/bin/plot_density.R +46 -0
data/bin/plot_scatterplot.R +25 -0
data/bin/reg2phen.rb +116 -0
data/bin/region_to_patients_generator.rb +84 -0
data/bin/relate_CI_to_association_value.rb +90 -0
data/bin/setup +8 -0
data/bin/standardize_scores.R +40 -0
data/bin/xyplot_graph.R +60 -0
data/external_data/biosystems_gene.gz +0 -0
data/external_data/bsid2info.gz +0 -0
data/external_data/chromosome_sizes_hg19.txt +24 -0
data/external_data/gene_data.gz +0 -0
data/external_data/gene_data_with_pathways.gz +0 -0
data/external_data/gene_location.gz +0 -0
data/external_data/hp.obo +146363 -0
data/external_data/remove +0 -0
data/lib/pets.rb +6 -0
data/lib/pets/coPatReporterMethods.rb +77 -0
data/lib/pets/generalMethods.rb +556 -0
data/lib/pets/phen2reg_methods.rb +432 -0
data/lib/pets/version.rb +3 -0
data/pets.gemspec +47 -0
data/templates/cohort_report.erb +93 -0
data/templates/patient_report.erb +209 -0
metadata +183 -0

data/bin/phen2reg_predictor_check.rb ADDED Viewed

@@ -0,0 +1,297 @@
+#! /usr/bin/env ruby
+##########################
+#LIBRARIES
+##########################
+require 'optparse'
+##########################
+#METHODS
+##########################
# Script-wide accumulators. They live as instance variables of the top-level
# object so that every method below can read and update them.
@success_percentage_distribution = [] # one success value appended per process_in_out_regions call
@prediction_vector = [] # one true/false per prediction: did it touch the control region?
@rankings = { in: [], out: [] } # percentile positions filled in by compute_rankings
@genome_fraction_predicted = 0 # predicted length lying outside the control region ("out")
@good_predicted_subregions = 0 # predicted length lying inside the control region ("in")
# Converts the hits/misses stored in @prediction_vector into percentile
# positions (1-based position divided by a hundredth of the list length) and
# files each one under @rankings[:in] or @rankings[:out].
# The vector is emptied afterwards; nothing happens when it is already empty.
def compute_rankings
	return if @prediction_vector.empty?

	one_percent = @prediction_vector.length.fdiv(100)
	@prediction_vector.each.with_index(1) do |hit, position|
		bucket = hit ? :in : :out
		@rankings[bucket] << position.fdiv(one_percent)
	end
	@prediction_vector = []
end
# Loads the tab-separated prediction file.
#
# Columns read from each line: 0 = "ProfID:<n>", 1 = chromosome (or the literal
# 'Results not found'), 2 = start, 3 = stop, 4 = comma-separated HPO list,
# 6 = score. Column 5 is not used here.
#
# input_file - path of the prediction file.
#
# Returns an Array with one entry per non-blank line:
#   [profile_index, pred_chr, pred_start, pred_stop, score, predicted_hpos_number]
# or just [profile_index] for a 'Results not found' line.
def load_prediction_file(input_file)
	predicted_regions = []
	# File.foreach closes the file once iteration ends; File.open without a
	# block kept the handle open until garbage collection.
	File.foreach(input_file) do |raw_line|
		predicted_info = raw_line.chomp.split("\t")
		next if predicted_info.empty? # blank line (e.g. trailing newline): nothing to record

		profile_index = predicted_info[0].gsub('ProfID:', '').to_i
		if predicted_info[1] == 'Results not found'
			predicted_regions << [profile_index]
		else
			predicted_hpos_number = predicted_info[4].split(',').length
			predicted_regions << [
				profile_index,
				predicted_info[1],
				predicted_info[2].to_i,
				predicted_info[3].to_i,
				predicted_info[6].to_f,
				predicted_hpos_number
			]
		end
	end
	predicted_regions
end
# Estimates the score used for imputed regions: the length-weighted mean score
# of the first regions of the ordered list, taking regions until their
# combined length reaches one thousandth (integer division) of the total
# predicted length.
#
# predicted_regions  - rows as produced by load_prediction_file; only rows whose
#                      fifth element is a Float take part.
# integration_method - when 'fisher' the list is ordered by descending score,
#                      otherwise by ascending score.
#
# Returns a Float, or nil when no score can be computed (no scored row, or
# every scored region has zero length). Previously those cases raised
# NoMethodError or returned NaN.
def get_imputation_scores(predicted_regions, integration_method)
	scored_regions = predicted_regions.select { |r| r[4].is_a?(Float) }
	return nil if scored_regions.empty?

	# [score, region length] pairs
	score_length_pairs = scored_regions.map { |r| [r[4], r[3] - r[2]] }
	score_length_pairs.sort! { |p1, p2| p1.first <=> p2.first }
	score_length_pairs.reverse! if integration_method == 'fisher'

	total_region_length = score_length_pairs.inject(0) { |sum, pair| sum + pair.last }
	length2inspect = total_region_length / 1000
	accumulated_score = 0
	inspected_length = 0
	score_length_pairs.each do |score, length|
		accumulated_score += score * length
		inspected_length += length
		# Never stop on an empty window: dividing by a zero length gives NaN.
		break if inspected_length > 0 && inspected_length >= length2inspect
	end
	return nil if inspected_length.zero?

	accumulated_score.fdiv(inspected_length)
end
# Returns imputation_score shifted up or down (direction picked at random) by
# a random amount of at most imputation_score * desv.
def generate_random_imp_score(imputation_score, desv)
	offset = imputation_score * desv * rand
	subtract = [true, false].sample
	subtract ? imputation_score - offset : imputation_score + offset
end
# Loads the control file: one line per patient, "chr:start:stop<TAB>hpo|hpo|...".
#
# input_data_file - path of the control file.
#
# Returns an Array of [chr, start, stop, number_of_phenotypes], in file order.
# Rows are later addressed by position, so lines are never skipped: a line
# without both tab-separated fields raises ArgumentError naming the line
# (it used to fail with an unexplained NoMethodError on nil).
def load_patient_data(input_data_file)
	patient_data = []
	# The block form closes the file as soon as reading finishes.
	File.open(input_data_file) do |file|
		file.each_line.with_index(1) do |raw_line, line_number|
			mutation_coords, hpo_profile = raw_line.chomp.split("\t")
			if mutation_coords.nil? || hpo_profile.nil?
				raise ArgumentError, "#{input_data_file}:#{line_number}: expected 'chr:start:stop<TAB>hpo|hpo|...'"
			end
			number_of_phenotypes = hpo_profile.split('|').length
			chr, start_pos, stop_pos = mutation_coords.split(':')
			patient_data << [chr, start_pos.to_i, stop_pos.to_i, number_of_phenotypes]
		end
	end
	patient_data
end
# Builds the [group, score] table used for the precision-recall curve.
#
# Predictions are walked in order. Regions of predictions that pass the HPO
# recovery filter are accumulated; whenever a row belongs to a profile other
# than the last one that passed the filter, the accumulated regions are handed
# to process_in_out_regions, the per-profile counters are reset and the
# rankings are computed.
#
# ctrl_regions      - rows from load_patient_data, indexed by profile index.
# predicted_regions - rows from load_prediction_file.
# scale             - forwarded to process_in_out_regions.
# imputation_score  - forwarded to process_in_out_regions.
# hpo_min_recovery  - a prediction is kept only when its percentage of
#                     recovered HPO terms is strictly greater than this value.
# apply_imputation  - forwarded to process_in_out_regions.
#
# Returns the concatenation of every process_in_out_regions result.
#
# Fixes over the previous version: compute_rankings is also run after the last
# flush (the final profile's rankings were being dropped), and the final flush
# is skipped when no control coordinates are known (empty prediction list),
# which used to fail on nil arithmetic.
# NOTE(review): a flush is triggered by every row whose profile differs from
# the last one that passed the filter, so consecutive filtered-out rows are
# flushed with an empty region list. That behaviour is kept as is - confirm it
# is intended.
def get_perfomance_table(ctrl_regions, predicted_regions, scale, imputation_score, hpo_min_recovery, apply_imputation)
	table = []
	# Kept outside the block: the coordinates of the previous row are needed
	# by the flush of the next iteration and by the final flush.
	last_profile_id = ctrl_start = ctrl_stop = nil
	in_out_regions = []
	predicted_regions.each do |profile_index, pred_chr, pred_start, pred_stop, score, predicted_hpos_number|
		if !last_profile_id.nil? && last_profile_id != profile_index
			table.concat(process_in_out_regions(ctrl_start, ctrl_stop, scale, in_out_regions, imputation_score, apply_imputation))
			@genome_fraction_predicted = 0
			@good_predicted_subregions = 0
			in_out_regions = []
			compute_rankings
		end
		# Control region of this prediction's profile (all nil when the index is unknown).
		ctrl_chr, ctrl_start, ctrl_stop, number_of_phenotypes = ctrl_regions[profile_index]
		next if predicted_hpos_number.nil? || number_of_phenotypes.nil?

		hpo_recovery_percentage = (predicted_hpos_number / number_of_phenotypes.to_f) * 100
		next unless hpo_recovery_percentage > hpo_min_recovery

		in_out_regions.concat(get_in_out_regions(ctrl_chr, ctrl_start, ctrl_stop, pred_chr, pred_start, pred_stop, score))
		last_profile_id = profile_index
	end
	unless ctrl_start.nil? || ctrl_stop.nil?
		table.concat(process_in_out_regions(ctrl_start, ctrl_stop, scale, in_out_regions, imputation_score, apply_imputation))
	end
	compute_rankings # no-op when nothing is pending
	table
end
# Closes the evaluation of one control region.
#
# Records the success value of in_out_regions, optionally appends an imputed
# "in" region covering the part of the control region that was not predicted
# (in_out_regions is modified in place), and expands every region into a
# number of [group, score] rows proportional to its share of the evaluated
# length (control length plus @genome_fraction_predicted), scaled by `scale`
# and rounded up.
#
# Returns the Array of [group, score] rows.
def process_in_out_regions(ctrl_start, ctrl_stop, scale, in_out_regions, imputation_score, apply_imputation)
	@success_percentage_distribution << get_sucess_percentage(in_out_regions)
	ctrl_length = ctrl_stop - ctrl_start
	missed_length = ctrl_length - @good_predicted_subregions
	if missed_length > 0 && apply_imputation
		in_out_regions << ["in", generate_random_imp_score(imputation_score, 0.35), missed_length]
	end
	evaluated_genome_fraction = @genome_fraction_predicted + ctrl_length
	in_out_regions.each_with_object([]) do |(group, score, region_length), rows|
		list_entries = (region_length.fdiv(evaluated_genome_fraction) * scale).ceil
		rows.concat(Array.new(list_entries, [group, score]))
	end
end
# Splits one predicted region into the part inside the control region ("in")
# and the parts outside it ("out").
#
# Side effects: "in" lengths are added to @good_predicted_subregions, "out"
# lengths to @genome_fraction_predicted, and true/false (whether any "in"
# part exists) is appended to @prediction_vector.
#
# Returns an Array of [group, score, region_length]; it is empty when the
# chromosomes differ and pred_chr is nil.
def get_in_out_regions(ctrl_chr, ctrl_start, ctrl_stop, pred_chr, pred_start, pred_stop, score)
	segments =
		if ctrl_chr != pred_chr
			# Different chromosome: everything predicted is outside (nothing at all without a prediction).
			pred_chr.nil? ? [] : [["out", pred_stop - pred_start]]
		elsif pred_start < ctrl_start && pred_stop > ctrl_stop
			# Prediction spans the whole control region and overhangs on both sides.
			[["in", ctrl_stop - ctrl_start], ["out", ctrl_start - pred_start], ["out", pred_stop - ctrl_stop]]
		elsif pred_start >= ctrl_start && pred_stop <= ctrl_stop
			# Prediction lies completely inside the control region.
			[["in", pred_stop - pred_start]]
		elsif ctrl_start < pred_stop && ctrl_stop >= pred_stop
			# Prediction overhangs upstream of the control region.
			[["in", pred_stop - ctrl_start], ["out", ctrl_start - pred_start]]
		elsif ctrl_start <= pred_start && ctrl_stop > pred_start
			# Prediction overhangs downstream of the control region.
			[["in", ctrl_stop - pred_start], ["out", pred_stop - ctrl_stop]]
		else
			# Same chromosome, no overlap.
			[["out", pred_stop - pred_start]]
		end

	in_out_regions = segments.map do |group, region_length|
		if group == "in"
			@good_predicted_subregions += region_length
		else
			@genome_fraction_predicted += region_length
		end
		[group, score, region_length]
	end
	@prediction_vector << segments.any? { |group, _length| group == 'in' }
	in_out_regions
end
# Fraction (0..1) of the total region length that is labelled 'in'.
#
# in_out_regions - Array of [group, score, region_length].
#
# Returns 0 when there is nothing to measure: an empty list or, newly, regions
# whose lengths add up to zero (that case used to return NaN from 0.fdiv(0)).
# Otherwise returns a Float.
def get_sucess_percentage(in_out_regions)
	inside = 0
	total = 0
	in_out_regions.each do |group, _score, reg_length|
		inside += reg_length if group == 'in'
		total += reg_length
	end
	return 0 if total.zero?

	inside.fdiv(total)
end
+##########################
+#OPT-PARSE
+##########################
# Default value of every command-line switch; the handlers below overwrite
# them with whatever is given on the command line.
options = {
  input_prediction: nil,
  meth: nil,
  output_file: 'final_values_for_pr_curve.txt',
  hpo_recovery: 0,
  input_regions: nil,
  success_percentage: 'success_percentage',
  apply_imputation: false,
  scale_size: 100
}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-i", "--input_prediction PATH", "Input prediction file for checking") do |path|
    options[:input_prediction] = path
  end

  opts.on("-m", "--meth STRING", "Method used in score integration calculation, affects to the imputation algorithm (if used)") do |name|
    options[:meth] = name
  end

  opts.on("-o", "--output_file PATH", "Output results for PR curve") do |path|
    options[:output_file] = path
  end

  # The value is read as a float and must be strictly positive when given.
  opts.on("-p", "--hpo_recovery INTEGER", "Minimum percentage of HPO terms to consider predictions") do |value|
    options[:hpo_recovery] = value.to_f
    abort("Please, choose a recovery value higher than 0") if options[:hpo_recovery] <= 0
  end

  opts.on("-r", "--input_regions PATH", "Input patients true affected regions (ctrl file)") do |path|
    options[:input_regions] = path
  end

  opts.on("-s", "--success_percentage PATH", "Output results with success percentage for each prediction") do |path|
    options[:success_percentage] = path
  end

  opts.on("-y", "--apply_imputation", "Activates imputation") do
    options[:apply_imputation] = true
  end

  # The value is read as an integer and must be strictly positive when given.
  opts.on("-z", "--scale_size INTEGER", "Scale region size to avoid long range regions") do |value|
    options[:scale_size] = value.to_i
    abort("Please, choose a scale value higher than 0") if options[:scale_size] <= 0
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
option_parser.parse!
+##########################
+#MAIN
+##########################
# Both input files are mandatory; without this check a missing switch ended in
# a TypeError raised from File.open(nil).
abort("Please, provide an input prediction file (-i)") if options[:input_prediction].nil?
abort("Please, provide the patients true affected regions file (-r)") if options[:input_regions].nil?

predicted_regions = load_prediction_file(options[:input_prediction])
imputation_score = get_imputation_scores(predicted_regions, options[:meth])
patient_data = load_patient_data(options[:input_regions])
table = get_perfomance_table(patient_data, predicted_regions, options[:scale_size], imputation_score, options[:hpo_recovery], options[:apply_imputation])

# Table for the precision-recall curve: one "group<TAB>score" row per entry.
File.open(options[:output_file], 'w') do |f|
	f.puts "group\tscore"
	table.each do |output, score|
		if score.nil?
			# Rows without a score are dropped unless imputation is active.
			next unless options[:apply_imputation]
			score = generate_random_imp_score(imputation_score, 0.35)
		end
		f.puts "#{output}\t#{score}"
	end
end

# Success value recorded at each flush of accumulated predictions.
File.open(options[:success_percentage], 'w') do |f|
	f.puts 'perc'
	@success_percentage_distribution.each { |pg| f.puts pg }
end

# Percentile rankings, written to the fixed file name 'ranking' in the
# current working directory.
File.open('ranking', 'w') do |f|
	f.puts "reg\tranking"
	@rankings.each do |reg, ranks|
		ranks.each { |rank| f.puts "#{reg}\t#{rank}" }
	end
end

data/bin/plot_area.R ADDED Viewed

@@ -0,0 +1,71 @@
+#! /usr/bin/env Rscript
+# x,y graph
+library(ggplot2)
+library(optparse)
+################################################################
+# OPTPARSE
+################################################################
option_list <- list(
	make_option(c("-d", "--data_file"), type="character",
		help="Tabulated file with information about each sample"),
	make_option(c("-o", "--output"), type="character", default="results",
		help="Output figure file"),
	make_option(c("-x", "--x_values"), type="character",
		help="Name of column with values to be plotted"),
	make_option(c("-y", "--y_values"), type="character",
		help="Name of column with values to be plotted"),
	make_option(c("-f", "--density_values"), type="character",
		help="Name of column to be used as density values"),
	make_option(c("-H", "--header"), action="store_false", default=TRUE,
        help="The input table not have header line"),
	make_option(c("-X", "--x_title"), type="character",
	 	help="Name of column to be used for bars titles"),
	make_option(c("-Y", "--y_title"), type="character",
	 	help="Title of y axis"),
	make_option(c("-F", "--output_format"), type="character", default="pdf",
	 	help="pdf or jpeg file output format"),
    make_option(c("-m", "--maxs_file"), type="character", default="",
        help="Tabulated file maximum of each sample"),
    make_option(c("-t", "--graph_title"), type="character", default="",
        help="Title of the graph")
)
opt <- parse_args(OptionParser(option_list=option_list))

################################################################
## MAIN
################################################################
# Fail early with a clear message instead of an obscure read.table error.
if (is.null(opt$data_file)) {
	stop("A data file must be given with -d/--data_file")
}
# Only these two devices are supported. With any other value no device was
# opened and the figure silently went to R's default device.
if (!(opt$output_format %in% c("pdf", "jpeg"))) {
	stop(paste("Unsupported output format:", opt$output_format, "(use pdf or jpeg)"))
}

data <- read.table(opt$data_file, sep="\t", header=opt$header)

if (opt$output_format == "pdf") {
	pdf(paste(opt$output, '.pdf', sep=""))
} else {
	jpeg(paste(opt$output, '.jpeg', sep=""))
}

# NOTE(review): the plot always uses columns V1 (chromosome), V2 (x) and V3 (y);
# the -x/-y/-f switches are parsed but never read. Confirm whether the input is
# expected to carry these column names when it has a header line.
goodChrOrder <- c(1:22, "X", "Y")
data$V1 <- factor(data$V1, levels=goodChrOrder)

# Optional table with one vertical marker (column V2) per facet (column V1).
maxs <- NULL
if (opt$maxs_file != "") {
	maxs <- read.table(opt$maxs_file, sep="\t", header=FALSE)
}

obj <- ggplot(data=data, aes(x=V2, y=V3))
obj <- obj + geom_area(aes(fill=V1))
obj <- obj + facet_wrap(~ V1, ncol=2, strip.position = "right")
if (!is.null(maxs) && length(maxs) > 0) {
	obj <- obj + geom_vline(data = maxs, aes(xintercept = V2))
}
obj <- obj + xlab(opt$x_title)
obj <- obj + ylab(opt$y_title)
obj <- obj + theme(axis.text.x = element_text(angle = 45, hjust = 1))
# "none" hides the fill legend; passing FALSE is deprecated in current ggplot2.
obj <- obj + guides(fill="none")
obj <- obj + ggtitle(label = opt$graph_title)
print(obj)
dev.off()

data/bin/plot_boxplot.R ADDED Viewed

@@ -0,0 +1,21 @@
+#! /usr/bin/env Rscript
+library(ggplot2)
# Box plot of one column against another.
# Positional arguments: 1 input table (with header), 2 output directory,
# 3 x column name, 4 y column name, 5 x axis label, 6 y axis label.
# The figure is written to <output directory>/boxplot.pdf.
args <- commandArgs(trailingOnly = TRUE)
data <- read.table(args[1], header=TRUE)
output <- args[2]
x_axis <- args[3]
y_axis <- args[4]
x_tag <- args[5]
y_tag <- args[6]

# Keep the x categories in their order of first appearance in the file.
data[[x_axis]] <- factor(data[[x_axis]], levels = unique(data[[x_axis]]))

pdf(file.path(output, 'boxplot.pdf'))
boxplot_obj <- ggplot(data, aes(x=data[[x_axis]], y=data[[y_axis]]))
boxplot_obj <- boxplot_obj + geom_boxplot()
boxplot_obj <- boxplot_obj + xlab(x_tag) + ylab(y_tag)
boxplot_obj <- boxplot_obj + theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(boxplot_obj)
dev.off()

data/bin/plot_density.R ADDED Viewed

@@ -0,0 +1,46 @@
+#! /usr/bin/env Rscript
+library(ggplot2)
# Density plot of one column, optionally split by a grouping column.
# Positional arguments: 1 input table (tab separated, with header),
# 2 name of the value column, 3 upper x limit (optional),
# 4 name of the grouping column (optional), 5 lower x limit (optional).
# The figure is always written to out.pdf in the working directory.
args <- commandArgs(trailingOnly = TRUE)
file <- args[1]
values <- args[2]
x_axis_limit <- as.numeric(args[3])
groups <- args[4]
x_axis_limit_min <- as.numeric(args[5])

data <- read.table(file, header = TRUE, sep="\t")

pdf('out.pdf')
# An argument that was not supplied is NA: without a grouping column a single
# density is drawn, otherwise one translucent density per group.
if (is.na(groups)) {
	obj <- ggplot(data, aes(x=data[[values]])) + geom_density()
} else {
	obj <- ggplot(data, aes(x=data[[values]], fill=data[[groups]])) + geom_density(alpha=.3)
}
obj <- obj + theme(legend.title=element_blank())

# Restrict the x axis only when an upper limit was given; the lower limit
# falls back to 0 when it is missing.
if (!is.na(x_axis_limit)) {
	xmin <- if (is.na(x_axis_limit_min)) 0 else x_axis_limit_min
	obj <- obj + xlim(xmin, x_axis_limit)
}
print(obj)
dev.off()

data/bin/plot_scatterplot.R ADDED Viewed

@@ -0,0 +1,25 @@
+#! /usr/bin/env Rscript
+library(ggplot2)
# Scatter plot whose point size is driven by a third column.
# Positional arguments: 1 input table (with header), 2 output directory,
# 3 x column name, 4 y column name, 5 column mapped to point size,
# 6 x axis label, 7 y axis label, 8 size legend title.
# The figure is written to <output directory>/scatterplot.pdf.
args <- commandArgs(trailingOnly = TRUE)
data <- read.table(args[1], header=TRUE)
output <- args[2]
x_axis <- args[3]
y_axis <- args[4]
density <- args[5]
x_tag <- args[6]
y_tag <- args[7]
size_tag <- args[8]

# Keep the x categories in their order of first appearance in the file.
data[[x_axis]] <- factor(data[[x_axis]], levels = unique(data[[x_axis]]))

pdf(file.path(output, 'scatterplot.pdf'))
scatter_obj <- ggplot(data, aes(x=data[[x_axis]], y=data[[y_axis]]))
scatter_obj <- scatter_obj + geom_point(aes(size=data[[density]]))
scatter_obj <- scatter_obj + xlab(x_tag) + ylab(y_tag)
scatter_obj <- scatter_obj + labs(size = size_tag)
scatter_obj <- scatter_obj + theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(scatter_obj)
dev.off()