pets 0.2.3 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/README.md +79 -5
  4. data/bin/coPatReporter.rb +68 -156
  5. data/bin/comPatMondo.rb +1 -4
  6. data/bin/evidence_profiler.rb +102 -150
  7. data/bin/get_gen_features.rb +146 -0
  8. data/bin/get_network_nodes.rb +79 -132
  9. data/bin/get_sorted_profs.rb +25 -36
  10. data/bin/install_deps.rb +8 -0
  11. data/bin/paco_translator.rb +29 -72
  12. data/bin/phen2reg.rb +1 -4
  13. data/bin/profiles2phenopacket.rb +86 -0
  14. data/bin/reg2phen.rb +1 -3
  15. data/example_datasets/associations_file.txt +757 -0
  16. data/example_datasets/example_patient.txt +6 -0
  17. data/example_datasets/example_patient_hpos.txt +15 -0
  18. data/example_datasets/genes.txt +8 -0
  19. data/example_datasets/hpo2ci.txt +2798 -0
  20. data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
  21. data/example_datasets/launch.sh +20 -0
  22. data/external_code/generate_boxpot.R +51 -21
  23. data/external_code/get_clusters.R +2 -2
  24. data/external_code/install_R_dependencies.R +16 -0
  25. data/external_code/plot_heatmap.R +34 -30
  26. data/lib/pets/coPatReporterMethods.rb +172 -424
  27. data/lib/pets/cohort.rb +309 -0
  28. data/lib/pets/common_optparse.rb +30 -0
  29. data/lib/pets/constants.rb +8 -0
  30. data/lib/pets/generalMethods.rb +29 -319
  31. data/lib/pets/genomic_features.rb +240 -0
  32. data/lib/pets/io.rb +481 -0
  33. data/lib/pets/parsers/cohort_parser.rb +111 -0
  34. data/lib/pets/parsers/reference_parser.rb +39 -0
  35. data/lib/pets/version.rb +1 -1
  36. data/lib/pets.rb +9 -0
  37. data/pets.gemspec +7 -3
  38. data/templates/cluster_report.erb +25 -5
  39. data/templates/cohort_report.erb +5 -7
  40. data/templates/evidence_profile.erb +20 -4
  41. data/templates/patient_report.erb +1 -1
  42. metadata +96 -5
@@ -0,0 +1,20 @@
1
+ #! /usr/bin/env bash
2
+
3
+ current=`pwd`
4
+ hpo_file=$current/../external_data/hp.obo
5
+
6
+ #Launch Cohort Analyzer
7
+ mkdir -p $current/cohort_analyzer_results
8
+ cd $current/cohort_analyzer_results
9
+ coPatReporter.rb -i $current/hummu_congenital_full_dataset.txt -o $current/cohort_analyzer_results -p phenotypes -c chr -d patient_id -s start -e stop -m lin
10
+ cd ..
11
+ # Launch Reg2Phen
12
+ mkdir -p $current/reg2phen_results
13
+ cd $current/reg2phen_results
14
+ reg2phen.rb -t $current/associations_file.txt -p $current/genes.txt -b $hpo_file -P -g -H -o $current/results/patient1Genes.txt -F $current/reg2phen_results/patient1Genes.html
15
+ cd ..
16
+ # Launch Phen2Reg
17
+ mkdir -p $current/phen2reg_results
18
+ cd $current/phen2reg_results
19
+ phen2reg.rb -t $current/associations_file.txt -M 50 -p $current/example_patient_hpos.txt -k -y 0 -d prednum -i $current/hpo2ci.txt -r fisher -f $hpo_file -P 0.05 -b 1 -m -T -Q > $current/phen2reg_results/single_phens.txt
20
+ cd ..
@@ -1,18 +1,37 @@
1
1
  #!/usr/bin/env Rscript
2
2
  suppressMessages(library(dplyr))
3
3
 
4
- load_file <- function(file_path, cluster_sim_out = NULL){
4
+ #####################
5
+ ## FUNCTIONS
6
+ #####################
7
+
8
+ load_file <- function(file_path, cluster_sim_out = NULL, sim_method = 'lin'){
5
9
  # sim_matrix <- read.table(file = file.path(file_path), sep = "\t", stringsAsFactors = FALSE, header = FALSE)
6
- sim_matrix <- RcppCNPy::npyLoad(file.path(file_path, "similarity_matrix_lin.npy"))
7
- axis_labels <- read.table(file.path(file_path, "similarity_matrix_lin.lst"), header=FALSE, stringsAsFactors=FALSE)
8
- colnames(sim_matrix) <- axis_labels$V1
9
- rownames(sim_matrix) <- axis_labels$V1
10
- diag(sim_matrix) <- NA
10
+ file_name <- file.path(file_path,paste0('similarity_matrix_',sim_method,'.npy'))
11
+ sim_matrix <- RcppCNPy::npyLoad(file_name)
12
+ file_name <- file.path(file_path,paste0('similarity_matrix_',sim_method,'.lst'))
13
+ if(file.exists(file_name)){ # square matrix
11
14
 
12
- groups <- read.table(file.path(file_path, "lin_clusters.txt"), header=FALSE)
15
+ axis_labels <- read.table(file_name, header=FALSE, stringsAsFactors=FALSE, sep="\t")
16
+ colnames(sim_matrix) <- axis_labels$V1
17
+ rownames(sim_matrix) <- axis_labels$V1
18
+ diag(sim_matrix) <- NA
19
+ file_name <- paste0(sim_method,'_clusters.txt')
20
+ split_mode = "byboth"
21
+
22
+ }else{ # rectangular matrix
23
+ axis_labels_x <- read.table(file.path(file_path, paste0('similarity_matrix_',sim_method,'_x.lst')), header=FALSE, stringsAsFactors=FALSE, sep="\t")
24
+ axis_labels_y <- read.table(file.path(file_path, paste0('similarity_matrix_',sim_method,'_y.lst')), header=FALSE, stringsAsFactors=FALSE, sep="\t")
25
+ colnames(sim_matrix) <- axis_labels_x$V1
26
+ rownames(sim_matrix) <- axis_labels_y$V1
27
+ file_name <- paste0(sim_method,'_clusters_rows.txt')
28
+ split_mode = "byrows"
29
+ }
30
+
31
+ groups <- read.table(file.path(file_path, file_name), header=FALSE, sep="\t")
13
32
  groups_vec <- groups[,2]
14
33
  names(groups_vec) <- groups[,1]
15
- sim_within_groups <- calc_sim_within_groups(sim_matrix, groups_vec)
34
+ sim_within_groups <- calc_sim_within_groups(sim_matrix, groups_vec, split_mode = split_mode)
16
35
  if (!is.null(cluster_sim_out))
17
36
  write.table(sim_within_groups, cluster_sim_out, quote=FALSE, row.names=TRUE, sep="\t", col.names = FALSE)
18
37
  sim_matrix <- sim_matrix %>% as.data.frame %>% tibble::rownames_to_column() %>%
@@ -26,24 +45,34 @@ load_file <- function(file_path, cluster_sim_out = NULL){
26
45
  }
27
46
 
28
47
 
29
- get_group_submatrix_mean <- function(group, matrix_transf, groups=groups) {
30
- mean(matrix_transf[
31
- names(groups)[groups %in% group],
32
- names(groups)[groups %in% group]
33
- ], na.rm=TRUE
34
- )
48
+ get_group_submatrix_mean <- function(group, matrix_transf, groups=groups, split_mode = "byboth") {
49
+ submatrix <- matrix_transf
50
+ if (split_mode %in% c("byboth", "bycols")){
51
+ submatrix <- submatrix[,names(groups)[groups %in% group]]
52
+ }
53
+
54
+ if (split_mode %in% c("byboth", "byrows")){
55
+ submatrix <- submatrix[names(groups)[groups %in% group],]
56
+ }
57
+ mean(submatrix, na.rm=TRUE)
35
58
  }
36
59
 
37
- calc_sim_within_groups <- function(matrix_transf, groups) {
60
+ calc_sim_within_groups <- function(matrix_transf, groups, split_mode = "byboth") {
38
61
  unique_groups <- unique(groups)
39
- group_mean_sim <- sapply(unique_groups, get_group_submatrix_mean, matrix_transf=matrix_transf, groups=groups)
62
+ group_mean_sim <- sapply(unique_groups, get_group_submatrix_mean, matrix_transf=matrix_transf, groups=groups, split_mode = split_mode)
40
63
  names(group_mean_sim) <- unique_groups
41
64
  group_mean_sim
42
65
  }
43
66
 
67
+ #####################
68
+ ## OPTPARSE
69
+ #####################
70
+
44
71
  option_list <- list(
45
72
  optparse::make_option(c("-i", "--input_paths"), type="character", default=NULL,
46
- help="Path to Npy and names."),
73
+ help="Path to Npy and names"),
74
+ optparse::make_option(c("-m", "--sim_method"), type="character", default='lin',
75
+ help="Similarity method"),
47
76
  optparse::make_option(c("-o", "--output_file"), type="character", default=NULL,
48
77
  help="Output graph file name"),
49
78
  optparse::make_option(c("-t", "--tags"), type="character", default=NULL,
@@ -51,6 +80,9 @@ option_list <- list(
51
80
  )
52
81
  opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
53
82
 
83
+ #####################
84
+ ## MAIN
85
+ #####################
54
86
 
55
87
  all_files <- unlist(strsplit(opt$input_paths, ","))
56
88
  tags <- seq(length(all_files))
@@ -60,7 +92,7 @@ if (!is.null(opt$tags)){
60
92
 
61
93
  similarity_dist <- list()
62
94
  for (file_i in seq(length(all_files))) {
63
- similarity_dist[[tags[file_i]]] <- load_file(all_files[file_i], cluster_sim_out = paste0(opt$output_file,"_", tags[file_i],"_cluster_sim"))
95
+ similarity_dist[[tags[file_i]]] <- load_file(all_files[file_i], cluster_sim_out = paste0(opt$output_file,"_", tags[file_i],"_cluster_sim"), sim_method = opt$sim_method)
64
96
  }
65
97
  similarity_dist[["enod"]] <- NULL
66
98
  for (tag in names(similarity_dist)){
@@ -83,8 +115,6 @@ pp <- ggplot2::ggplot(similarity_dist, ggplot2::aes(x = Cohort, y = Similarity,
83
115
  legend.position = "top",
84
116
  legend.title = ggplot2::element_text(size = 14),
85
117
  legend.text = ggplot2::element_text(size = 14)) +
86
- ggplot2::labs(fill = "Lin similarity")
118
+ ggplot2::labs(fill = paste0(opt$sim_method, " similarity"))
87
119
 
88
120
  ggplot2::ggsave(filename = paste0(opt$output_file,".png"),pp,width = 20, height = 18, dpi = 200, units = "cm", device='png')
89
-
90
-
@@ -16,8 +16,8 @@ option_list <- list(
16
16
  opt <- parse_args(OptionParser(option_list=option_list))
17
17
 
18
18
  if(!is.null(opt$npy)){
19
- x_axis_labels <- read.table(paste0(opt$npy, '_x.lst'), header=FALSE, stringsAsFactors=FALSE)
20
- y_axis_labels <- read.table(paste0(opt$npy, '_y.lst'), header=FALSE, stringsAsFactors=FALSE)
19
+ x_axis_labels <- read.table(paste0(opt$npy, '_x.lst'), header=FALSE, stringsAsFactors=FALSE, sep="\t")
20
+ y_axis_labels <- read.table(paste0(opt$npy, '_y.lst'), header=FALSE, stringsAsFactors=FALSE, sep="\t")
21
21
  matrix_data <- npyLoad(opt$data_file)
22
22
  colnames(matrix_data) <- x_axis_labels$V1
23
23
  rownames(matrix_data) <- y_axis_labels$V1
@@ -0,0 +1,16 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ ######################################################################################################
4
+ ####################### LIBRARY INSTALLING SCRIPT for PETS #################################
5
+ ######################################################################################################
6
+ print("Installing libraries from CRAN")
7
+ packages_list <-c("optparse","RcppCNPy","ggplot2","fastcluster","dplyr","gplots","RColorBrewer","tidyr","data.table","gridExtra", "dynamicTreeCut", "ggExtra", "ontologyIndex", "magrittr")
8
+ installed <- library()$results[,1]
9
+ packages_list <- setdiff(packages_list, installed)
10
+ if(length(packages_list) == 0){
11
+ print('All needed packages are installed')
12
+ }else{
13
+ install.packages(packages_list, repos='https://cloud.r-project.org')
14
+ }
15
+
16
+
@@ -141,11 +141,10 @@ if(opt$pairs){ # Load pairs
141
141
  data_raw$SetB <- as.character(data_raw$SetB)
142
142
  if(opt$same_sets){
143
143
  all_elements <- sort(unique(unlist(data_raw[,c("SetA","SetB")])))
144
- data <- matrix(0, length(all_elements), length(all_elements), dimnames = list(all_elements, all_elements))
145
- str(data_raw)
146
- str(data)
144
+ data <- matrix(NA, length(all_elements), length(all_elements), dimnames = list(all_elements, all_elements))
147
145
  data[as.matrix(data_raw[,c("SetA","SetB")])] <- data_raw$Value
148
146
  data[as.matrix(data_raw[,c("SetB","SetA")])] <- data_raw$Value
147
+ # save(data, file = "test2.RData")
149
148
  } else {
150
149
  rowSet <- unique(data_raw$SetA)
151
150
  colSet <- unique(data_raw$SetB)
@@ -154,10 +153,16 @@ if(opt$pairs){ # Load pairs
154
153
  }
155
154
  }else{ # Load matrix
156
155
  if(!is.null(opt$npy)){
157
- axis_labels <- read.table(opt$npy, header=FALSE, stringsAsFactors=FALSE)
158
156
  data <- npyLoad(opt$data_file)
159
- colnames(data) <- axis_labels$V1
157
+ axis_files <- unlist(strsplit(opt$npy, ','))
158
+ axis_labels <- read.table(axis_files[1], header=FALSE, stringsAsFactors=FALSE, sep="\t")
160
159
  rownames(data) <- axis_labels$V1
160
+ if(length(axis_files) == 2){
161
+ x_axis_labels <- read.table(axis_files[2], header=FALSE, stringsAsFactors=FALSE, sep="\t")
162
+ colnames(data) <- x_axis_labels$V1
163
+ }else{
164
+ colnames(data) <- axis_labels$V1
165
+ }
161
166
  }else{
162
167
  data <- as.matrix(read.table(opt$data_file, sep="\t", header=opt$header, stringsAsFactors=FALSE, row.names= 1, check.names = FALSE))
163
168
  }
@@ -187,32 +192,32 @@ if(opt$same_sets){
187
192
  hr <- fastcluster::hclust(as.dist(matrix_transf), method="ward.D2")
188
193
  groups <- cluster_obj_to_groups(matrix_transf, hr, opt$tree_cut_method, minProportionCluster = opt$minProportionCluster)
189
194
 
190
- sim_between_groups <- calc_sim_between_groups(data, groups)
191
- distance_between_groups <- 1 - sim_between_groups
192
- groups_clustered <- fastcluster::hclust(as.dist(distance_between_groups), method="ward.D2")
193
-
194
- # Plot dendrogram to check performance
195
- # png(file=file.path(opt$output, 'dendrogram_groups.png', sep=''))
196
- # plot(dendrogram_groups)
197
- # dev.off()
198
- ######### EXPORT
199
195
  write.table(groups, file=paste0(opt$output, '_clusters.txt'), sep="\t", quote=FALSE, col.names=FALSE, row.names= TRUE)
200
196
  if (opt$save_raw_clust){
201
197
  dendrogram_groups <- as.dendrogram(hr)
202
- } else {
198
+ } else { # TODO Pedro: I have no idea about the reason for the following code
199
+ sim_between_groups <- calc_sim_between_groups(data, groups)
200
+ distance_between_groups <- 1 - sim_between_groups
201
+ groups_clustered <- fastcluster::hclust(as.dist(distance_between_groups), method="ward.D2")
203
202
  dendrogram_groups <- as.dendrogram(groups_clustered)
204
203
  }
204
+ # Plot dendrogram to check performance
205
+ # png(file=file.path(opt$output, 'dendrogram_groups.png', sep=''))
206
+ # plot(dendrogram_groups)
207
+ # dev.off()
205
208
  save(dendrogram_groups, file=paste0(opt$output, '_dendrogram_groups.RData', sep=''))
206
209
 
207
210
  }else{
208
211
  # Calc similarities of rows
209
212
  mdistRows = toDistances(matrix_transf)
210
213
  mdistCols = toDistances(matrix_transf, FALSE)
214
+ rownames(mdistRows) <- rownames(matrix_transf) # The square matrices obtained have lost
215
+ colnames(mdistRows) <- rownames(matrix_transf) # row and col names. We retrieve them
216
+ rownames(mdistCols) <- colnames(matrix_transf) # from original matrix. In previous case,
217
+ colnames(mdistCols) <- colnames(matrix_transf) # the matrix_transf is used directly by hclust
211
218
  # Obtaining clustering
212
- quantValue_row <- quantile(mdistRows, c(.2), na.rm = TRUE)
213
219
  hr_row <- fastcluster::hclust(as.dist(mdistRows), method="ward.D2")
214
- groups_row <- cluster_obj_to_groups(mdistRows, hr, opt$tree_cut_method)
215
-
220
+ groups_row <- cluster_obj_to_groups(mdistRows, hr_row, opt$tree_cut_method, minProportionCluster = opt$minProportionCluster)
216
221
  quantValue_col <- quantile(mdistCols, c(.2), na.rm = TRUE)
217
222
  hr_col <- fastcluster::hclust(as.dist(mdistCols), method="ward.D2")
218
223
  groups_col <- cutree(hr_col, h = quantValue_col)
@@ -227,19 +232,18 @@ if(opt$pdf){
227
232
  }else{
228
233
  png(paste0(opt$output, '_heatmap.png'), width = 1000, height = 1000, units = "px", res=175, pointsize = 8)
229
234
  }
230
- if(opt$same_sets){
231
- group_colours <- colorRampPalette(brewer.pal(8, "Set1"))(length(unique(groups)))
232
- # print("cluster number:")
233
- # print(length(unique(groups)))
234
- # print(length(unique(group_colours)))
235
- group_colours_arranged <- group_colours[groups]
236
- heatmap.2(data, Rowv=as.dendrogram(hr), Colv=as.dendrogram(hr), trace="none", col=brewer.pal(11,"RdBu"), dendrogram = c("row"), labRow = FALSE, labCol = FALSE,
237
- xlab = opt$collabel, ylab = opt$rowlabel, RowSideColors=group_colours_arranged)
238
- }else{
239
- heatmap.2(data, Rowv=as.dendrogram(hr_row), Colv=as.dendrogram(hr_col), trace="none", col=brewer.pal(11,"RdBu"), labRow = FALSE, labCol = FALSE,
240
- xlab = opt$collabel, ylab = opt$rowlabel)
235
+ if(opt$same_sets){
236
+ group_colours <- colorRampPalette(brewer.pal(8, "Set1"))(length(unique(groups)))
237
+ group_colours_arranged <- c(rep('#000000', length(groups[groups == 0])), group_colours[groups])
238
+ heatmap.2(data, Rowv=as.dendrogram(hr), Colv=as.dendrogram(hr), trace="none", col=brewer.pal(11,"RdBu"), dendrogram = c("row"), labRow = FALSE, labCol = FALSE,
239
+ xlab = opt$collabel, ylab = opt$rowlabel, RowSideColors=group_colours_arranged)
240
+ }else{
241
+ group_colours <- colorRampPalette(brewer.pal(8, "Set1"))(length(unique(groups_row)))
242
+ group_colours_arranged <- c(rep('#000000', length(groups_row[groups_row == 0])), group_colours[groups_row])
243
+ heatmap.2(data, Rowv=as.dendrogram(hr_row), Colv=as.dendrogram(hr_col), trace="none", col=brewer.pal(11,"RdBu"), dendrogram = c("row"), labRow = FALSE, labCol = FALSE,
244
+ xlab = opt$collabel, ylab = opt$rowlabel, RowSideColors=group_colours_arranged)
241
245
 
242
- }
246
+ }
243
247
  dev.off()
244
248
  # save.image("test.RData")
245
249