RubyGems - pets - Versions diffs - 0.2.3 → 0.2.4 - Mend

pets 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

checksums.yaml +4 -4
data/Gemfile +2 -0
data/README.md +79 -5
data/bin/coPatReporter.rb +63 -156
data/bin/comPatMondo.rb +1 -4
data/bin/evidence_profiler.rb +38 -151
data/bin/get_network_nodes.rb +79 -132
data/bin/get_sorted_profs.rb +25 -36
data/bin/install_deps.rb +7 -0
data/bin/paco_translator.rb +29 -72
data/bin/phen2reg.rb +1 -4
data/bin/profiles2phenopacket.rb +110 -0
data/bin/reg2phen.rb +1 -3
data/example_datasets/associations_file.txt +757 -0
data/example_datasets/example_patient.txt +6 -0
data/example_datasets/example_patient_hpos.txt +15 -0
data/example_datasets/genes.txt +8 -0
data/example_datasets/hpo2ci.txt +2798 -0
data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
data/example_datasets/launch.sh +20 -0
data/external_code/generate_boxpot.R +51 -21
data/external_code/get_clusters.R +2 -2
data/external_code/install_R_dependencies.R +11 -0
data/external_code/plot_heatmap.R +34 -30
data/lib/pets/coPatReporterMethods.rb +143 -441
data/lib/pets/cohort.rb +307 -0
data/lib/pets/constants.rb +7 -0
data/lib/pets/generalMethods.rb +8 -317
data/lib/pets/genomic_features.rb +144 -0
data/lib/pets/io.rb +457 -0
data/lib/pets/parsers/cohort_parser.rb +106 -0
data/lib/pets/version.rb +1 -1
data/lib/pets.rb +8 -0
data/pets.gemspec +1 -0
data/templates/cohort_report.erb +5 -7
data/templates/patient_report.erb +1 -1
metadata +34 -3

data/example_datasets/launch.sh ADDED Viewed

@@ -0,0 +1,20 @@
+#! /usr/bin/env bash
+# Example launcher: runs coPatReporter, reg2phen and phen2reg on the example
+# input files. Start it from the directory that holds those input files; every
+# path below is resolved against the directory the script is started from.
+current=$(pwd)
+hpo_file="$current/../external_data/hp.obo"
+#Launch Cohort Analyzer
+mkdir -p "$current/cohort_analyzer_results"
+cd "$current/cohort_analyzer_results"
+	coPatReporter.rb -i "$current/hummu_congenital_full_dataset.txt" -o "$current/cohort_analyzer_results" -p phenotypes -c chr -d patient_id -s start -e stop -m lin
+cd ..
+# Launch Reg2Phen
+mkdir -p "$current/reg2phen_results"
+cd "$current/reg2phen_results"
+	# Both outputs go to reg2phen_results, the only directory this section creates.
+	reg2phen.rb -t "$current/associations_file.txt" -p "$current/genes.txt" -b "$hpo_file" -P -g -H -o "$current/reg2phen_results/patient1Genes.txt" -F "$current/reg2phen_results/patient1Genes.html"
+cd ..
+# Launch Phen2Reg
+mkdir -p "$current/phen2reg_results"
+cd "$current/phen2reg_results"
+	phen2reg.rb -t "$current/associations_file.txt" -M 50 -p "$current/example_patient_hpos.txt" -k -y 0 -d prednum -i "$current/hpo2ci.txt" -r fisher -f "$hpo_file" -P 0.05 -b 1 -m -T -Q > "$current/phen2reg_results/single_phens.txt"
+cd ..

data/external_code/generate_boxpot.R CHANGED Viewed

@@ -1,18 +1,37 @@
 #!/usr/bin/env Rscript
 suppressMessages(library(dplyr))
-load_file <- function(file_path, cluster_sim_out = NULL){
+#####################
+## FUNCTIONS
+#####################
+load_file <- function(file_path, cluster_sim_out = NULL, sim_method = 'lin'){
 	# sim_matrix <- read.table(file = file.path(file_path), sep = "\t", stringsAsFactors = FALSE, header = FALSE)
-	sim_matrix <- RcppCNPy::npyLoad(file.path(file_path, "similarity_matrix_lin.npy"))
-	axis_labels <- read.table(file.path(file_path, "similarity_matrix_lin.lst"), header=FALSE, stringsAsFactors=FALSE)
- 	colnames(sim_matrix) <- axis_labels$V1
- 	rownames(sim_matrix) <- axis_labels$V1
- 	diag(sim_matrix) <- NA
+	file_name <- file.path(file_path,paste0('similarity_matrix_',sim_method,'.npy'))
+	sim_matrix <- RcppCNPy::npyLoad(file_name)
+	file_name <- file.path(file_path,paste0('similarity_matrix_',sim_method,'.lst'))
+	if(file.exists(file_name)){ # squared matrix
- 	groups <- read.table(file.path(file_path, "lin_clusters.txt"), header=FALSE)
+		axis_labels <- read.table(file_name, header=FALSE, stringsAsFactors=FALSE, sep="\t")
+	 	colnames(sim_matrix) <- axis_labels$V1
+	 	rownames(sim_matrix) <- axis_labels$V1
+	 	diag(sim_matrix) <- NA
+		file_name <- paste0(sim_method,'_clusters.txt')
+		split_mode = "byboth"
+	}else{ # rectangular matrix
+		axis_labels_x <- read.table(file.path(file_path, paste0('similarity_matrix_',sim_method,'_x.lst')), header=FALSE, stringsAsFactors=FALSE, sep="\t")
+		axis_labels_y <- read.table(file.path(file_path, paste0('similarity_matrix_',sim_method,'_y.lst')), header=FALSE, stringsAsFactors=FALSE, sep="\t")
+	 	colnames(sim_matrix) <- axis_labels_x$V1
+	 	rownames(sim_matrix) <- axis_labels_y$V1
+		file_name <- paste0(sim_method,'_clusters_rows.txt')
+		split_mode = "byrows"
+	}
+ 	groups <- read.table(file.path(file_path, file_name), header=FALSE, sep="\t")
  	groups_vec <- groups[,2]
 	names(groups_vec) <- groups[,1]
- 	sim_within_groups <- calc_sim_within_groups(sim_matrix, groups_vec)
+ 	sim_within_groups <- calc_sim_within_groups(sim_matrix, groups_vec, split_mode = split_mode)
  	if (!is.null(cluster_sim_out))
  	write.table(sim_within_groups, cluster_sim_out, quote=FALSE, row.names=TRUE, sep="\t", col.names = FALSE)
 	sim_matrix <- sim_matrix %>% as.data.frame %>% tibble::rownames_to_column() %>%
@@ -26,24 +45,34 @@ load_file <- function(file_path, cluster_sim_out = NULL){
 }
-get_group_submatrix_mean <- function(group, matrix_transf, groups=groups) {
-  mean(matrix_transf[
-		names(groups)[groups %in% group],
-		names(groups)[groups %in% group]
-      ], na.rm=TRUE
-  )
+# Mean of the similarity values that belong to one cluster.
+#   group         : cluster id (or ids) to summarise
+#   matrix_transf : similarity matrix indexed by item names
+#   groups        : named vector, item name -> cluster id
+#   split_mode    : "byboth" keeps only the cluster members on rows and columns,
+#                   "bycols" filters columns only, "byrows" filters rows only
+# Returns the mean of the selected cells, ignoring NA values.
+get_group_submatrix_mean <- function(group, matrix_transf, groups=groups, split_mode = "byboth") {
+	members <- names(groups)[groups %in% group]
+	submatrix <- matrix_transf
+	if (split_mode %in% c("byboth", "bycols")){
+		# drop = FALSE keeps a one-member cluster as a matrix, so the row
+		# subset below still has two dimensions to index
+		submatrix <- submatrix[, members, drop = FALSE]
+	}
+	if (split_mode %in% c("byboth", "byrows")){
+		submatrix <- submatrix[members, , drop = FALSE]
+	}
+  mean(submatrix, na.rm=TRUE)
 }
-calc_sim_within_groups <- function(matrix_transf, groups) {
+# Mean within-cluster similarity for every distinct cluster id in `groups`.
+#   matrix_transf : similarity matrix indexed by item names
+#   groups        : named vector, item name -> cluster id
+#   split_mode    : forwarded to get_group_submatrix_mean
+# Returns a numeric vector named by cluster id.
+calc_sim_within_groups <- function(matrix_transf, groups, split_mode = "byboth") {
 	unique_groups <- unique(groups)
-	group_mean_sim <- sapply(unique_groups, get_group_submatrix_mean, matrix_transf=matrix_transf, groups=groups)
+	# vapply always returns a numeric vector, even when there are no groups
+	group_mean_sim <- vapply(unique_groups, get_group_submatrix_mean, FUN.VALUE = numeric(1), matrix_transf=matrix_transf, groups=groups, split_mode = split_mode)
 	names(group_mean_sim) <- unique_groups
 	group_mean_sim
 }
+#####################
+## OPTPARSE
+#####################
 option_list <- list(
   optparse::make_option(c("-i", "--input_paths"), type="character", default=NULL,
-    help="Path to Npy and names."),
+    help="Path to Npy and names"),
+  optparse::make_option(c("-m", "--sim_method"), type="character", default='lin',
+    help="Similarity method"),
   optparse::make_option(c("-o", "--output_file"), type="character", default=NULL,
     help="Output graph file name"),
   optparse::make_option(c("-t", "--tags"), type="character", default=NULL,
@@ -51,6 +80,9 @@ option_list <- list(
 )
 opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
+#####################
+## MAIN
+#####################
 all_files <- unlist(strsplit(opt$input_paths, ","))
 tags <- seq(length(all_files))
@@ -60,7 +92,7 @@ if (!is.null(opt$tags)){
 similarity_dist <- list()
 for (file_i in seq(length(all_files))) {
-	similarity_dist[[tags[file_i]]] <- load_file(all_files[file_i], cluster_sim_out = paste0(opt$output_file,"_", tags[file_i],"_cluster_sim"))
+	similarity_dist[[tags[file_i]]] <- load_file(all_files[file_i], cluster_sim_out = paste0(opt$output_file,"_", tags[file_i],"_cluster_sim"), sim_method = opt$sim_method)
 }
 similarity_dist[["enod"]] <- NULL
 for (tag in names(similarity_dist)){
@@ -83,8 +115,6 @@ pp <- ggplot2::ggplot(similarity_dist, ggplot2::aes(x = Cohort, y = Similarity,
 				   legend.position = "top",
 				   legend.title = ggplot2::element_text(size = 14),
   					legend.text = ggplot2::element_text(size = 14)) +
-	ggplot2::labs(fill = "Lin similarity")
+	ggplot2::labs(fill = paste0(opt$sim_method, " similarity"))
 ggplot2::ggsave(filename = paste0(opt$output_file,".png"),pp,width = 20, height = 18, dpi = 200, units = "cm", device='png')

data/external_code/get_clusters.R CHANGED Viewed

@@ -16,8 +16,8 @@ option_list <- list(
 opt <- parse_args(OptionParser(option_list=option_list))
 if(!is.null(opt$npy)){
-	x_axis_labels <- read.table(paste0(opt$npy, '_x.lst'), header=FALSE, stringsAsFactors=FALSE)
-	y_axis_labels <- read.table(paste0(opt$npy, '_y.lst'), header=FALSE, stringsAsFactors=FALSE)
+	x_axis_labels <- read.table(paste0(opt$npy, '_x.lst'), header=FALSE, stringsAsFactors=FALSE, sep="\t")
+	y_axis_labels <- read.table(paste0(opt$npy, '_y.lst'), header=FALSE, stringsAsFactors=FALSE, sep="\t")
 	matrix_data <- npyLoad(opt$data_file)
 	colnames(matrix_data) <- x_axis_labels$V1
 	rownames(matrix_data) <- y_axis_labels$V1

data/external_code/install_R_dependencies.R ADDED Viewed

@@ -0,0 +1,11 @@
+#! /usr/bin/env Rscript
+######################################################################################################
+####################### LIBRARY INSTALLING SCRIPT for PETS #################################
+######################################################################################################
+# Installs from CRAN the packages listed below that are not installed yet.
+print("Installing libraries from CRAN")
+packages_list <-c("optparse","RcppCNPy","ggplot2","fastcluster","dplyr","gplots","RColorBrewer","tidyr","data.table","gridExtra", "dynamicTreeCut", "ggExtra", "ontologyIndex", "magrittr")
+installed <- rownames(installed.packages())
+packages_list <- setdiff(packages_list, installed)
+# Skip the call when everything is already present, so install.packages is
+# never invoked with an empty package vector
+if (length(packages_list) > 0) {
+	install.packages(packages_list, repos='https://cloud.r-project.org')
+}

data/external_code/plot_heatmap.R CHANGED Viewed

@@ -141,11 +141,10 @@ if(opt$pairs){ # Load pairs
 	data_raw$SetB <- as.character(data_raw$SetB)
 	if(opt$same_sets){
 		all_elements <- sort(unique(unlist(data_raw[,c("SetA","SetB")])))
-	 	data <- matrix(0, length(all_elements), length(all_elements), dimnames = list(all_elements, all_elements))
-	 	str(data_raw)
-	 	str(data)
+	 	data <- matrix(NA, length(all_elements), length(all_elements), dimnames = list(all_elements, all_elements))
 	  	data[as.matrix(data_raw[,c("SetA","SetB")])] <- data_raw$Value
 	  	data[as.matrix(data_raw[,c("SetB","SetA")])] <- data_raw$Value
+	  	# save(data, file = "test2.RData")
 	} else {
 		rowSet <- unique(data_raw$SetA)
 		colSet <- unique(data_raw$SetB)
@@ -154,10 +153,16 @@ if(opt$pairs){ # Load pairs
 	}
 }else{ # Load matrix
 	if(!is.null(opt$npy)){
-		axis_labels <- read.table(opt$npy, header=FALSE, stringsAsFactors=FALSE)
 		data <- npyLoad(opt$data_file)
-		colnames(data) <- axis_labels$V1
+		axis_files <- unlist(strsplit(opt$npy, ','))
+		axis_labels <- read.table(axis_files[1], header=FALSE, stringsAsFactors=FALSE, sep="\t")
 		rownames(data) <- axis_labels$V1
+		if(length(axis_files) == 2){
+			x_axis_labels <- read.table(axis_files[2], header=FALSE, stringsAsFactors=FALSE, sep="\t")
+			colnames(data) <- x_axis_labels$V1
+		}else{
+			colnames(data) <- axis_labels$V1
+		}
 	}else{
 		data <- as.matrix(read.table(opt$data_file, sep="\t", header=opt$header, stringsAsFactors=FALSE, row.names= 1, check.names = FALSE))
 	}
@@ -187,32 +192,32 @@ if(opt$same_sets){
 	hr <- fastcluster::hclust(as.dist(matrix_transf), method="ward.D2")
 	groups <- cluster_obj_to_groups(matrix_transf, hr, opt$tree_cut_method, minProportionCluster = opt$minProportionCluster)
-	sim_between_groups <- calc_sim_between_groups(data, groups)
-	distance_between_groups <- 1 - sim_between_groups
-	groups_clustered <- fastcluster::hclust(as.dist(distance_between_groups), method="ward.D2")
-	# Plot dendrogram to check performance
-	# png(file=file.path(opt$output, 'dendrogram_groups.png', sep=''))
-	#    plot(dendrogram_groups)
-	# dev.off()
-	######### EXPORT
 	write.table(groups, file=paste0(opt$output, '_clusters.txt'), sep="\t", quote=FALSE, col.names=FALSE, row.names= TRUE)
 	if (opt$save_raw_clust){
 		dendrogram_groups <- as.dendrogram(hr)
-	} else {
+	} else { # TODO Pedro: I have no idea about the reason for the following code
+		sim_between_groups <- calc_sim_between_groups(data, groups)
+		distance_between_groups <- 1 - sim_between_groups
+		groups_clustered <- fastcluster::hclust(as.dist(distance_between_groups), method="ward.D2")
 		dendrogram_groups <- as.dendrogram(groups_clustered)
 	}
+	# Plot dendrogram to check performance
+	# png(file=file.path(opt$output, 'dendrogram_groups.png', sep=''))
+	#    plot(dendrogram_groups)
+	# dev.off()
 	save(dendrogram_groups, file=paste0(opt$output, '_dendrogram_groups.RData', sep=''))
 }else{
 	# Calc similitudes of rows
 	mdistRows = toDistances(matrix_transf)
 	mdistCols = toDistances(matrix_transf, FALSE)
+	rownames(mdistRows) <- rownames(matrix_transf) # The square matrixes obtained have lost
+	colnames(mdistRows) <- rownames(matrix_transf) # row and col names. We retrieve them
+	rownames(mdistCols) <- colnames(matrix_transf) # from original matrix. In previous case,
+	colnames(mdistCols) <- colnames(matrix_transf) # the matrix_transf is used directly by hclust
 	# Obtaing clustering
-	quantValue_row <- quantile(mdistRows, c(.2), na.rm = TRUE)
 	hr_row <- fastcluster::hclust(as.dist(mdistRows), method="ward.D2")
-	groups_row <- cluster_obj_to_groups(mdistRows, hr, opt$tree_cut_method)
+	groups_row <- cluster_obj_to_groups(mdistRows, hr_row, opt$tree_cut_method, minProportionCluster = opt$minProportionCluster)
 	quantValue_col <- quantile(mdistCols, c(.2), na.rm = TRUE)
 	hr_col <- fastcluster::hclust(as.dist(mdistCols), method="ward.D2")
 	groups_col <- cutree(hr_col, h = quantValue_col)
@@ -227,19 +232,18 @@ if(opt$pdf){
 }else{
 	png(paste0(opt$output, '_heatmap.png'), width = 1000, height = 1000, units = "px", res=175, pointsize = 8)
 }
-	if(opt$same_sets){
-		group_colours <- colorRampPalette(brewer.pal(8, "Set1"))(length(unique(groups)))
-		# print("cluster number:")
-		# print(length(unique(groups)))
-		# print(length(unique(group_colours)))
-		group_colours_arranged <- group_colours[groups]
-		heatmap.2(data, Rowv=as.dendrogram(hr), Colv=as.dendrogram(hr), trace="none", col=brewer.pal(11,"RdBu"), dendrogram = c("row"), labRow = FALSE, labCol = FALSE,
-					xlab = opt$collabel, ylab = opt$rowlabel, RowSideColors=group_colours_arranged)
-	}else{
-		heatmap.2(data, Rowv=as.dendrogram(hr_row), Colv=as.dendrogram(hr_col), trace="none", col=brewer.pal(11,"RdBu"), labRow = FALSE, labCol = FALSE,
-					xlab = opt$collabel, ylab = opt$rowlabel)
+# Draw the heatmap with a side colour bar showing the row clusters.
+# Rows whose cluster id is 0 are painted black; every other id gets its own
+# palette colour. The colour vector is built position by position so that each
+# entry stays aligned with the row it describes.
+if(opt$same_sets){
+	group_colours <- colorRampPalette(brewer.pal(8, "Set1"))(length(unique(groups)))
+	group_colours_arranged <- rep('#000000', length(groups))
+	assigned <- which(groups > 0)
+	group_colours_arranged[assigned] <- group_colours[groups[assigned]]
+	heatmap.2(data, Rowv=as.dendrogram(hr), Colv=as.dendrogram(hr), trace="none", col=brewer.pal(11,"RdBu"), dendrogram = c("row"), labRow = FALSE, labCol = FALSE,
+				xlab = opt$collabel, ylab = opt$rowlabel, RowSideColors=group_colours_arranged)
+}else{
+	group_colours <- colorRampPalette(brewer.pal(8, "Set1"))(length(unique(groups_row)))
+	group_colours_arranged <- rep('#000000', length(groups_row))
+	assigned <- which(groups_row > 0)
+	group_colours_arranged[assigned] <- group_colours[groups_row[assigned]]
+	heatmap.2(data, Rowv=as.dendrogram(hr_row), Colv=as.dendrogram(hr_col), trace="none", col=brewer.pal(11,"RdBu"), dendrogram = c("row"), labRow = FALSE, labCol = FALSE,
+				xlab = opt$collabel, ylab = opt$rowlabel, RowSideColors=group_colours_arranged)
-	}
+}
 dev.off()
 # save.image("test.RData")