pets 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ #! /usr/bin/env bash
2
+
3
+ current=`pwd`
4
+ hpo_file=$current/../external_data/hp.obo
5
+
6
+ #Launch Cohort Analyzer
7
+ mkdir -p $current/cohort_analyzer_results
8
+ cd $current/cohort_analyzer_results
9
+ coPatReporter.rb -i $current/hummu_congenital_full_dataset.txt -o $current/cohort_analyzer_results -p phenotypes -c chr -d patient_id -s start -e stop -m lin
10
+ cd ..
11
+ # Launch Reg2Phen
12
+ mkdir -p $current/reg2phen_results
13
+ cd $current/reg2phen_results
14
+ reg2phen.rb -t $current/associations_file.txt -p $current/genes.txt -b $hpo_file -P -g -H -o $current/reg2phen_results/patient1Genes.txt -F $current/reg2phen_results/patient1Genes.html
15
+ cd ..
16
+ # Launch Phen2Reg
17
+ mkdir -p $current/phen2reg_results
18
+ cd $current/phen2reg_results
19
+ phen2reg.rb -t $current/associations_file.txt -M 50 -p $current/example_patient_hpos.txt -k -y 0 -d prednum -i $current/hpo2ci.txt -r fisher -f $hpo_file -P 0.05 -b 1 -m -T -Q > $current/phen2reg_results/single_phens.txt
20
+ cd ..
@@ -1,18 +1,37 @@
1
1
  #!/usr/bin/env Rscript
2
2
  suppressMessages(library(dplyr))
3
3
 
4
- load_file <- function(file_path, cluster_sim_out = NULL){
4
+ #####################
5
+ ## FUNCTIONS
6
+ #####################
7
+
8
+ load_file <- function(file_path, cluster_sim_out = NULL, sim_method = 'lin'){
5
9
  # sim_matrix <- read.table(file = file.path(file_path), sep = "\t", stringsAsFactors = FALSE, header = FALSE)
6
- sim_matrix <- RcppCNPy::npyLoad(file.path(file_path, "similarity_matrix_lin.npy"))
7
- axis_labels <- read.table(file.path(file_path, "similarity_matrix_lin.lst"), header=FALSE, stringsAsFactors=FALSE)
8
- colnames(sim_matrix) <- axis_labels$V1
9
- rownames(sim_matrix) <- axis_labels$V1
10
- diag(sim_matrix) <- NA
10
+ file_name <- file.path(file_path,paste0('similarity_matrix_',sim_method,'.npy'))
11
+ sim_matrix <- RcppCNPy::npyLoad(file_name)
12
+ file_name <- file.path(file_path,paste0('similarity_matrix_',sim_method,'.lst'))
13
+ if(file.exists(file_name)){ # squared matrix
11
14
 
12
- groups <- read.table(file.path(file_path, "lin_clusters.txt"), header=FALSE)
15
+ axis_labels <- read.table(file_name, header=FALSE, stringsAsFactors=FALSE, sep="\t")
16
+ colnames(sim_matrix) <- axis_labels$V1
17
+ rownames(sim_matrix) <- axis_labels$V1
18
+ diag(sim_matrix) <- NA
19
+ file_name <- paste0(sim_method,'_clusters.txt')
20
+ split_mode = "byboth"
21
+
22
+ }else{ # rectangular matrix
23
+ axis_labels_x <- read.table(file.path(file_path, paste0('similarity_matrix_',sim_method,'_x.lst')), header=FALSE, stringsAsFactors=FALSE, sep="\t")
24
+ axis_labels_y <- read.table(file.path(file_path, paste0('similarity_matrix_',sim_method,'_y.lst')), header=FALSE, stringsAsFactors=FALSE, sep="\t")
25
+ colnames(sim_matrix) <- axis_labels_x$V1
26
+ rownames(sim_matrix) <- axis_labels_y$V1
27
+ file_name <- paste0(sim_method,'_clusters_rows.txt')
28
+ split_mode = "byrows"
29
+ }
30
+
31
+ groups <- read.table(file.path(file_path, file_name), header=FALSE, sep="\t")
13
32
  groups_vec <- groups[,2]
14
33
  names(groups_vec) <- groups[,1]
15
- sim_within_groups <- calc_sim_within_groups(sim_matrix, groups_vec)
34
+ sim_within_groups <- calc_sim_within_groups(sim_matrix, groups_vec, split_mode = split_mode)
16
35
  if (!is.null(cluster_sim_out))
17
36
  write.table(sim_within_groups, cluster_sim_out, quote=FALSE, row.names=TRUE, sep="\t", col.names = FALSE)
18
37
  sim_matrix <- sim_matrix %>% as.data.frame %>% tibble::rownames_to_column() %>%
@@ -26,24 +45,34 @@ load_file <- function(file_path, cluster_sim_out = NULL){
26
45
  }
27
46
 
28
47
 
29
- get_group_submatrix_mean <- function(group, matrix_transf, groups=groups) {
30
- mean(matrix_transf[
31
- names(groups)[groups %in% group],
32
- names(groups)[groups %in% group]
33
- ], na.rm=TRUE
34
- )
48
+ get_group_submatrix_mean <- function(group, matrix_transf, groups=groups, split_mode = "byboth") { # mean of the submatrix restricted to the members of `group`
49
+ submatrix <- matrix_transf # start from the full matrix and narrow it according to split_mode
50
+ if (split_mode %in% c("byboth", "bycols")){
51
+ submatrix <- submatrix[, names(groups)[groups %in% group], drop = FALSE] # drop = FALSE: keep a matrix even for single-member groups
52
+ }
53
+
54
+ if (split_mode %in% c("byboth", "byrows")){
55
+ submatrix <- submatrix[names(groups)[groups %in% group], , drop = FALSE] # drop = FALSE: row subsetting must not fail or collapse on 1-element groups
56
+ }
57
+ mean(submatrix, na.rm=TRUE)
35
58
  }
36
59
 
37
- calc_sim_within_groups <- function(matrix_transf, groups) {
60
+ calc_sim_within_groups <- function(matrix_transf, groups, split_mode = "byboth") {
38
61
  unique_groups <- unique(groups)
39
- group_mean_sim <- sapply(unique_groups, get_group_submatrix_mean, matrix_transf=matrix_transf, groups=groups)
62
+ group_mean_sim <- sapply(unique_groups, get_group_submatrix_mean, matrix_transf=matrix_transf, groups=groups, split_mode = split_mode)
40
63
  names(group_mean_sim) <- unique_groups
41
64
  group_mean_sim
42
65
  }
43
66
 
67
+ #####################
68
+ ## OPTPARSE
69
+ #####################
70
+
44
71
  option_list <- list(
45
72
  optparse::make_option(c("-i", "--input_paths"), type="character", default=NULL,
46
- help="Path to Npy and names."),
73
+ help="Path to Npy and names"),
74
+ optparse::make_option(c("-m", "--sim_method"), type="character", default='lin',
75
+ help="Similarity method"),
47
76
  optparse::make_option(c("-o", "--output_file"), type="character", default=NULL,
48
77
  help="Output graph file name"),
49
78
  optparse::make_option(c("-t", "--tags"), type="character", default=NULL,
@@ -51,6 +80,9 @@ option_list <- list(
51
80
  )
52
81
  opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
53
82
 
83
+ #####################
84
+ ## MAIN
85
+ #####################
54
86
 
55
87
  all_files <- unlist(strsplit(opt$input_paths, ","))
56
88
  tags <- seq(length(all_files))
@@ -60,7 +92,7 @@ if (!is.null(opt$tags)){
60
92
 
61
93
  similarity_dist <- list()
62
94
  for (file_i in seq(length(all_files))) {
63
- similarity_dist[[tags[file_i]]] <- load_file(all_files[file_i], cluster_sim_out = paste0(opt$output_file,"_", tags[file_i],"_cluster_sim"))
95
+ similarity_dist[[tags[file_i]]] <- load_file(all_files[file_i], cluster_sim_out = paste0(opt$output_file,"_", tags[file_i],"_cluster_sim"), sim_method = opt$sim_method)
64
96
  }
65
97
  similarity_dist[["enod"]] <- NULL
66
98
  for (tag in names(similarity_dist)){
@@ -83,8 +115,6 @@ pp <- ggplot2::ggplot(similarity_dist, ggplot2::aes(x = Cohort, y = Similarity,
83
115
  legend.position = "top",
84
116
  legend.title = ggplot2::element_text(size = 14),
85
117
  legend.text = ggplot2::element_text(size = 14)) +
86
- ggplot2::labs(fill = "Lin similarity")
118
+ ggplot2::labs(fill = paste0(opt$sim_method, " similarity"))
87
119
 
88
120
  ggplot2::ggsave(filename = paste0(opt$output_file,".png"),pp,width = 20, height = 18, dpi = 200, units = "cm", device='png')
89
-
90
-
@@ -16,8 +16,8 @@ option_list <- list(
16
16
  opt <- parse_args(OptionParser(option_list=option_list))
17
17
 
18
18
  if(!is.null(opt$npy)){
19
- x_axis_labels <- read.table(paste0(opt$npy, '_x.lst'), header=FALSE, stringsAsFactors=FALSE)
20
- y_axis_labels <- read.table(paste0(opt$npy, '_y.lst'), header=FALSE, stringsAsFactors=FALSE)
19
+ x_axis_labels <- read.table(paste0(opt$npy, '_x.lst'), header=FALSE, stringsAsFactors=FALSE, sep="\t")
20
+ y_axis_labels <- read.table(paste0(opt$npy, '_y.lst'), header=FALSE, stringsAsFactors=FALSE, sep="\t")
21
21
  matrix_data <- npyLoad(opt$data_file)
22
22
  colnames(matrix_data) <- x_axis_labels$V1
23
23
  rownames(matrix_data) <- y_axis_labels$V1
@@ -0,0 +1,11 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ ######################################################################################################
4
+ ####################### LIBRARY INSTALLING SCRIPT for PETS #################################
5
+ ######################################################################################################
6
+ print("Installing libraries from CRAN")
7
+ packages_list <-c("optparse","RcppCNPy","ggplot2","fastcluster","dplyr","gplots","RColorBrewer","tidyr","data.table","gridExtra", "dynamicTreeCut", "ggExtra", "ontologyIndex", "magrittr")
8
+ installed <- library()$results[,1]
9
+ packages_list <- setdiff(packages_list, installed)
10
+ install.packages(packages_list, repos='https://cloud.r-project.org')
11
+
@@ -141,11 +141,10 @@ if(opt$pairs){ # Load pairs
141
141
  data_raw$SetB <- as.character(data_raw$SetB)
142
142
  if(opt$same_sets){
143
143
  all_elements <- sort(unique(unlist(data_raw[,c("SetA","SetB")])))
144
- data <- matrix(0, length(all_elements), length(all_elements), dimnames = list(all_elements, all_elements))
145
- str(data_raw)
146
- str(data)
144
+ data <- matrix(NA, length(all_elements), length(all_elements), dimnames = list(all_elements, all_elements))
147
145
  data[as.matrix(data_raw[,c("SetA","SetB")])] <- data_raw$Value
148
146
  data[as.matrix(data_raw[,c("SetB","SetA")])] <- data_raw$Value
147
+ # save(data, file = "test2.RData")
149
148
  } else {
150
149
  rowSet <- unique(data_raw$SetA)
151
150
  colSet <- unique(data_raw$SetB)
@@ -154,10 +153,16 @@ if(opt$pairs){ # Load pairs
154
153
  }
155
154
  }else{ # Load matrix
156
155
  if(!is.null(opt$npy)){
157
- axis_labels <- read.table(opt$npy, header=FALSE, stringsAsFactors=FALSE)
158
156
  data <- npyLoad(opt$data_file)
159
- colnames(data) <- axis_labels$V1
157
+ axis_files <- unlist(strsplit(opt$npy, ','))
158
+ axis_labels <- read.table(axis_files[1], header=FALSE, stringsAsFactors=FALSE, sep="\t")
160
159
  rownames(data) <- axis_labels$V1
160
+ if(length(axis_files) == 2){
161
+ x_axis_labels <- read.table(axis_files[2], header=FALSE, stringsAsFactors=FALSE, sep="\t")
162
+ colnames(data) <- x_axis_labels$V1
163
+ }else{
164
+ colnames(data) <- axis_labels$V1
165
+ }
161
166
  }else{
162
167
  data <- as.matrix(read.table(opt$data_file, sep="\t", header=opt$header, stringsAsFactors=FALSE, row.names= 1, check.names = FALSE))
163
168
  }
@@ -187,32 +192,32 @@ if(opt$same_sets){
187
192
  hr <- fastcluster::hclust(as.dist(matrix_transf), method="ward.D2")
188
193
  groups <- cluster_obj_to_groups(matrix_transf, hr, opt$tree_cut_method, minProportionCluster = opt$minProportionCluster)
189
194
 
190
- sim_between_groups <- calc_sim_between_groups(data, groups)
191
- distance_between_groups <- 1 - sim_between_groups
192
- groups_clustered <- fastcluster::hclust(as.dist(distance_between_groups), method="ward.D2")
193
-
194
- # Plot dendrogram to check performance
195
- # png(file=file.path(opt$output, 'dendrogram_groups.png', sep=''))
196
- # plot(dendrogram_groups)
197
- # dev.off()
198
- ######### EXPORT
199
195
  write.table(groups, file=paste0(opt$output, '_clusters.txt'), sep="\t", quote=FALSE, col.names=FALSE, row.names= TRUE)
200
196
  if (opt$save_raw_clust){
201
197
  dendrogram_groups <- as.dendrogram(hr)
202
- } else {
198
+ } else { # TODO Pedro: I have no idea about the reason for the following code
199
+ sim_between_groups <- calc_sim_between_groups(data, groups)
200
+ distance_between_groups <- 1 - sim_between_groups
201
+ groups_clustered <- fastcluster::hclust(as.dist(distance_between_groups), method="ward.D2")
203
202
  dendrogram_groups <- as.dendrogram(groups_clustered)
204
203
  }
204
+ # Plot dendrogram to check performance
205
+ # png(file=file.path(opt$output, 'dendrogram_groups.png', sep=''))
206
+ # plot(dendrogram_groups)
207
+ # dev.off()
205
208
  save(dendrogram_groups, file=paste0(opt$output, '_dendrogram_groups.RData', sep=''))
206
209
 
207
210
  }else{
208
211
  # Calc similitudes of rows
209
212
  mdistRows = toDistances(matrix_transf)
210
213
  mdistCols = toDistances(matrix_transf, FALSE)
214
+ rownames(mdistRows) <- rownames(matrix_transf) # The square matrixes obtained have lost
215
+ colnames(mdistRows) <- rownames(matrix_transf) # row and col names. We retrieve them
216
+ rownames(mdistCols) <- colnames(matrix_transf) # from original matrix. In previous case,
217
+ colnames(mdistCols) <- colnames(matrix_transf) # the matrix_transf is used directly by hclust
211
218
  # Obtaining clustering
212
- quantValue_row <- quantile(mdistRows, c(.2), na.rm = TRUE)
213
219
  hr_row <- fastcluster::hclust(as.dist(mdistRows), method="ward.D2")
214
- groups_row <- cluster_obj_to_groups(mdistRows, hr, opt$tree_cut_method)
215
-
220
+ groups_row <- cluster_obj_to_groups(mdistRows, hr_row, opt$tree_cut_method, minProportionCluster = opt$minProportionCluster)
216
221
  quantValue_col <- quantile(mdistCols, c(.2), na.rm = TRUE)
217
222
  hr_col <- fastcluster::hclust(as.dist(mdistCols), method="ward.D2")
218
223
  groups_col <- cutree(hr_col, h = quantValue_col)
@@ -227,19 +232,18 @@ if(opt$pdf){
227
232
  }else{
228
233
  png(paste0(opt$output, '_heatmap.png'), width = 1000, height = 1000, units = "px", res=175, pointsize = 8)
229
234
  }
230
- if(opt$same_sets){
231
- group_colours <- colorRampPalette(brewer.pal(8, "Set1"))(length(unique(groups)))
232
- # print("cluster number:")
233
- # print(length(unique(groups)))
234
- # print(length(unique(group_colours)))
235
- group_colours_arranged <- group_colours[groups]
236
- heatmap.2(data, Rowv=as.dendrogram(hr), Colv=as.dendrogram(hr), trace="none", col=brewer.pal(11,"RdBu"), dendrogram = c("row"), labRow = FALSE, labCol = FALSE,
237
- xlab = opt$collabel, ylab = opt$rowlabel, RowSideColors=group_colours_arranged)
238
- }else{
239
- heatmap.2(data, Rowv=as.dendrogram(hr_row), Colv=as.dendrogram(hr_col), trace="none", col=brewer.pal(11,"RdBu"), labRow = FALSE, labCol = FALSE,
240
- xlab = opt$collabel, ylab = opt$rowlabel)
235
+ if(opt$same_sets){
236
+ group_colours <- colorRampPalette(brewer.pal(8, "Set1"))(length(unique(groups)))
237
+ group_colours_arranged <- replace(rep('#000000', length(groups)), groups != 0, group_colours[groups]) # black for unassigned (0) members, kept in row order
238
+ heatmap.2(data, Rowv=as.dendrogram(hr), Colv=as.dendrogram(hr), trace="none", col=brewer.pal(11,"RdBu"), dendrogram = c("row"), labRow = FALSE, labCol = FALSE,
239
+ xlab = opt$collabel, ylab = opt$rowlabel, RowSideColors=group_colours_arranged)
240
+ }else{
241
+ group_colours <- colorRampPalette(brewer.pal(8, "Set1"))(length(unique(groups_row)))
242
+ group_colours_arranged <- replace(rep('#000000', length(groups_row)), groups_row != 0, group_colours[groups_row]) # black for unassigned (0) members, kept in row order
243
+ heatmap.2(data, Rowv=as.dendrogram(hr_row), Colv=as.dendrogram(hr_col), trace="none", col=brewer.pal(11,"RdBu"), dendrogram = c("row"), labRow = FALSE, labCol = FALSE,
244
+ xlab = opt$collabel, ylab = opt$rowlabel, RowSideColors=group_colours_arranged)
241
245
 
242
- }
246
+ }
243
247
  dev.off()
244
248
  # save.image("test.RData")
245
249