pets 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +63 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +38 -151
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +7 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +110 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +11 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +143 -441
- data/lib/pets/cohort.rb +307 -0
- data/lib/pets/constants.rb +7 -0
- data/lib/pets/generalMethods.rb +8 -317
- data/lib/pets/genomic_features.rb +144 -0
- data/lib/pets/io.rb +457 -0
- data/lib/pets/parsers/cohort_parser.rb +106 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +8 -0
- data/pets.gemspec +1 -0
- data/templates/cohort_report.erb +5 -7
- data/templates/patient_report.erb +1 -1
- metadata +34 -3
@@ -0,0 +1,20 @@
|
|
1
|
+
#! /usr/bin/env bash
# Runs the three PETS example workflows (coPatReporter.rb, reg2phen.rb and
# phen2reg.rb) on the example datasets.
# Every path is derived from the working directory at launch time, so the
# script has to be started from the directory that holds the datasets.

current=$(pwd)
hpo_file="$current/../external_data/hp.obo"

# Launch Cohort Analyzer
mkdir -p "$current/cohort_analyzer_results"
cd "$current/cohort_analyzer_results"
coPatReporter.rb -i "$current/hummu_congenital_full_dataset.txt" -o "$current/cohort_analyzer_results" -p phenotypes -c chr -d patient_id -s start -e stop -m lin
# The original line read `cd..`, which bash treats as an unknown command.
cd ..

# Launch Reg2Phen
mkdir -p "$current/reg2phen_results"
cd "$current/reg2phen_results"
# The table is written next to the HTML report. The original pointed -o at
# $current/results, a directory this script never creates.
reg2phen.rb -t "$current/associations_file.txt" -p "$current/genes.txt" -b "$hpo_file" -P -g -H -o "$current/reg2phen_results/patient1Genes.txt" -F "$current/reg2phen_results/patient1Genes.html"
cd ..

# Launch Phen2Reg
mkdir -p "$current/phen2reg_results"
cd "$current/phen2reg_results"
phen2reg.rb -t "$current/associations_file.txt" -M 50 -p "$current/example_patient_hpos.txt" -k -y 0 -d prednum -i "$current/hpo2ci.txt" -r fisher -f "$hpo_file" -P 0.05 -b 1 -m -T -Q > "$current/phen2reg_results/single_phens.txt"
cd ..
|
@@ -1,18 +1,37 @@
|
|
1
1
|
#!/usr/bin/env Rscript
|
2
2
|
suppressMessages(library(dplyr))
|
3
3
|
|
4
|
-
|
4
|
+
#####################
|
5
|
+
## FUNCTIONS
|
6
|
+
#####################
|
7
|
+
|
8
|
+
load_file <- function(file_path, cluster_sim_out = NULL, sim_method = 'lin'){
|
5
9
|
# sim_matrix <- read.table(file = file.path(file_path), sep = "\t", stringsAsFactors = FALSE, header = FALSE)
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
diag(sim_matrix) <- NA
|
10
|
+
file_name <- file.path(file_path,paste0('similarity_matrix_',sim_method,'.npy'))
|
11
|
+
sim_matrix <- RcppCNPy::npyLoad(file_name)
|
12
|
+
file_name <- file.path(file_path,paste0('similarity_matrix_',sim_method,'.lst'))
|
13
|
+
if(file.exists(file_name)){ # squared matrix
|
11
14
|
|
12
|
-
|
15
|
+
axis_labels <- read.table(file_name, header=FALSE, stringsAsFactors=FALSE, sep="\t")
|
16
|
+
colnames(sim_matrix) <- axis_labels$V1
|
17
|
+
rownames(sim_matrix) <- axis_labels$V1
|
18
|
+
diag(sim_matrix) <- NA
|
19
|
+
file_name <- paste0(sim_method,'_clusters.txt')
|
20
|
+
split_mode = "byboth"
|
21
|
+
|
22
|
+
}else{ # rectangular matrix
|
23
|
+
axis_labels_x <- read.table(file.path(file_path, paste0('similarity_matrix_',sim_method,'_x.lst')), header=FALSE, stringsAsFactors=FALSE, sep="\t")
|
24
|
+
axis_labels_y <- read.table(file.path(file_path, paste0('similarity_matrix_',sim_method,'_y.lst')), header=FALSE, stringsAsFactors=FALSE, sep="\t")
|
25
|
+
colnames(sim_matrix) <- axis_labels_x$V1
|
26
|
+
rownames(sim_matrix) <- axis_labels_y$V1
|
27
|
+
file_name <- paste0(sim_method,'_clusters_rows.txt')
|
28
|
+
split_mode = "byrows"
|
29
|
+
}
|
30
|
+
|
31
|
+
groups <- read.table(file.path(file_path, file_name), header=FALSE, sep="\t")
|
13
32
|
groups_vec <- groups[,2]
|
14
33
|
names(groups_vec) <- groups[,1]
|
15
|
-
sim_within_groups <- calc_sim_within_groups(sim_matrix, groups_vec)
|
34
|
+
sim_within_groups <- calc_sim_within_groups(sim_matrix, groups_vec, split_mode = split_mode)
|
16
35
|
if (!is.null(cluster_sim_out))
|
17
36
|
write.table(sim_within_groups, cluster_sim_out, quote=FALSE, row.names=TRUE, sep="\t", col.names = FALSE)
|
18
37
|
sim_matrix <- sim_matrix %>% as.data.frame %>% tibble::rownames_to_column() %>%
|
@@ -26,24 +45,34 @@ load_file <- function(file_path, cluster_sim_out = NULL){
|
|
26
45
|
}
|
27
46
|
|
28
47
|
|
29
|
-
# Mean of the similarity values restricted to the members of one cluster.
#
# group:         cluster id (or ids) to summarise.
# matrix_transf: matrix whose dimnames are the element ids found in
#                names(groups).
# groups:        named vector; names are element ids, values are cluster ids.
#                The default is self-referential, so callers must always pass
#                this argument explicitly.
# split_mode:    "byboth" keeps member rows and member columns, "bycols" only
#                member columns, "byrows" only member rows.
#
# Returns the mean of the selected cells, ignoring NA. The result is NaN when
# every selected cell is NA.
get_group_submatrix_mean <- function(group, matrix_transf, groups=groups, split_mode = "byboth") {
  members <- names(groups)[groups %in% group]
  submatrix <- matrix_transf
  # drop = FALSE keeps the selection a matrix even for a one-member cluster.
  # Without it the column subset collapses to a vector and the row subset
  # below fails with "incorrect number of dimensions".
  if (split_mode %in% c("byboth", "bycols")){
    submatrix <- submatrix[, members, drop = FALSE]
  }

  if (split_mode %in% c("byboth", "byrows")){
    submatrix <- submatrix[members, , drop = FALSE]
  }
  mean(submatrix, na.rm=TRUE)
}
|
36
59
|
|
37
|
-
# Mean similarity inside every cluster.
#
# matrix_transf: matrix handed unchanged to get_group_submatrix_mean().
# groups:        named vector; names are element ids, values are cluster ids.
# split_mode:    forwarded to get_group_submatrix_mean().
#
# Returns a numeric vector with one value per distinct cluster id, named by
# that id, in order of first appearance in `groups`.
calc_sim_within_groups <- function(matrix_transf, groups, split_mode = "byboth") {
  unique_groups <- unique(groups)
  # vapply pins the result to a numeric vector; sapply would hand back an
  # empty list when `groups` has no elements.
  group_mean_sim <- vapply(
    unique_groups,
    get_group_submatrix_mean,
    numeric(1),
    matrix_transf = matrix_transf,
    groups = groups,
    split_mode = split_mode,
    USE.NAMES = FALSE
  )
  names(group_mean_sim) <- unique_groups
  group_mean_sim
}
|
43
66
|
|
67
|
+
#####################
|
68
|
+
## OPTPARSE
|
69
|
+
#####################
|
70
|
+
|
44
71
|
option_list <- list(
|
45
72
|
optparse::make_option(c("-i", "--input_paths"), type="character", default=NULL,
|
46
|
-
help="Path to Npy and names
|
73
|
+
help="Path to Npy and names"),
|
74
|
+
optparse::make_option(c("-m", "--sim_method"), type="character", default='lin',
|
75
|
+
help="Similarity method"),
|
47
76
|
optparse::make_option(c("-o", "--output_file"), type="character", default=NULL,
|
48
77
|
help="Output graph file name"),
|
49
78
|
optparse::make_option(c("-t", "--tags"), type="character", default=NULL,
|
@@ -51,6 +80,9 @@ option_list <- list(
|
|
51
80
|
)
|
52
81
|
opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
|
53
82
|
|
83
|
+
#####################
|
84
|
+
## MAIN
|
85
|
+
#####################
|
54
86
|
|
55
87
|
all_files <- unlist(strsplit(opt$input_paths, ","))
|
56
88
|
tags <- seq(length(all_files))
|
@@ -60,7 +92,7 @@ if (!is.null(opt$tags)){
|
|
60
92
|
|
61
93
|
similarity_dist <- list()
|
62
94
|
for (file_i in seq(length(all_files))) {
|
63
|
-
similarity_dist[[tags[file_i]]] <- load_file(all_files[file_i], cluster_sim_out = paste0(opt$output_file,"_", tags[file_i],"_cluster_sim"))
|
95
|
+
similarity_dist[[tags[file_i]]] <- load_file(all_files[file_i], cluster_sim_out = paste0(opt$output_file,"_", tags[file_i],"_cluster_sim"), sim_method = opt$sim_method)
|
64
96
|
}
|
65
97
|
similarity_dist[["enod"]] <- NULL
|
66
98
|
for (tag in names(similarity_dist)){
|
@@ -83,8 +115,6 @@ pp <- ggplot2::ggplot(similarity_dist, ggplot2::aes(x = Cohort, y = Similarity,
|
|
83
115
|
legend.position = "top",
|
84
116
|
legend.title = ggplot2::element_text(size = 14),
|
85
117
|
legend.text = ggplot2::element_text(size = 14)) +
|
86
|
-
ggplot2::labs(fill = "
|
118
|
+
ggplot2::labs(fill = paste0(opt$sim_method, " similarity"))
|
87
119
|
|
88
120
|
ggplot2::ggsave(filename = paste0(opt$output_file,".png"),pp,width = 20, height = 18, dpi = 200, units = "cm", device='png')
|
89
|
-
|
90
|
-
|
@@ -16,8 +16,8 @@ option_list <- list(
|
|
16
16
|
opt <- parse_args(OptionParser(option_list=option_list))
|
17
17
|
|
18
18
|
if(!is.null(opt$npy)){
|
19
|
-
x_axis_labels <- read.table(paste0(opt$npy, '_x.lst'), header=FALSE, stringsAsFactors=FALSE)
|
20
|
-
y_axis_labels <- read.table(paste0(opt$npy, '_y.lst'), header=FALSE, stringsAsFactors=FALSE)
|
19
|
+
x_axis_labels <- read.table(paste0(opt$npy, '_x.lst'), header=FALSE, stringsAsFactors=FALSE, sep="\t")
|
20
|
+
y_axis_labels <- read.table(paste0(opt$npy, '_y.lst'), header=FALSE, stringsAsFactors=FALSE, sep="\t")
|
21
21
|
matrix_data <- npyLoad(opt$data_file)
|
22
22
|
colnames(matrix_data) <- x_axis_labels$V1
|
23
23
|
rownames(matrix_data) <- y_axis_labels$V1
|
@@ -0,0 +1,11 @@
|
|
1
|
+
#! /usr/bin/env Rscript

######################################################################################################
####################### LIBRARY INSTALLING SCRIPT for PETS #################################
######################################################################################################
# Installs from CRAN every package in packages_list that is not already
# present in the local R libraries.
print("Installing libraries from CRAN")
packages_list <- c("optparse", "RcppCNPy", "ggplot2", "fastcluster", "dplyr", "gplots", "RColorBrewer", "tidyr", "data.table", "gridExtra", "dynamicTreeCut", "ggExtra", "ontologyIndex", "magrittr")
# First column of library()'s result table holds the installed package names.
installed <- library()$results[, 1]
packages_list <- setdiff(packages_list, installed)
# Skip the installer call when nothing is missing, so a fully provisioned
# machine never reaches install.packages() with an empty request.
if (length(packages_list) > 0) {
  install.packages(packages_list, repos = 'https://cloud.r-project.org')
}
|
11
|
+
|
@@ -141,11 +141,10 @@ if(opt$pairs){ # Load pairs
|
|
141
141
|
data_raw$SetB <- as.character(data_raw$SetB)
|
142
142
|
if(opt$same_sets){
|
143
143
|
all_elements <- sort(unique(unlist(data_raw[,c("SetA","SetB")])))
|
144
|
-
data <- matrix(
|
145
|
-
str(data_raw)
|
146
|
-
str(data)
|
144
|
+
data <- matrix(NA, length(all_elements), length(all_elements), dimnames = list(all_elements, all_elements))
|
147
145
|
data[as.matrix(data_raw[,c("SetA","SetB")])] <- data_raw$Value
|
148
146
|
data[as.matrix(data_raw[,c("SetB","SetA")])] <- data_raw$Value
|
147
|
+
# save(data, file = "test2.RData")
|
149
148
|
} else {
|
150
149
|
rowSet <- unique(data_raw$SetA)
|
151
150
|
colSet <- unique(data_raw$SetB)
|
@@ -154,10 +153,16 @@ if(opt$pairs){ # Load pairs
|
|
154
153
|
}
|
155
154
|
}else{ # Load matrix
|
156
155
|
if(!is.null(opt$npy)){
|
157
|
-
axis_labels <- read.table(opt$npy, header=FALSE, stringsAsFactors=FALSE)
|
158
156
|
data <- npyLoad(opt$data_file)
|
159
|
-
|
157
|
+
axis_files <- unlist(strsplit(opt$npy, ','))
|
158
|
+
axis_labels <- read.table(axis_files[1], header=FALSE, stringsAsFactors=FALSE, sep="\t")
|
160
159
|
rownames(data) <- axis_labels$V1
|
160
|
+
if(length(axis_files) == 2){
|
161
|
+
x_axis_labels <- read.table(axis_files[2], header=FALSE, stringsAsFactors=FALSE, sep="\t")
|
162
|
+
colnames(data) <- x_axis_labels$V1
|
163
|
+
}else{
|
164
|
+
colnames(data) <- axis_labels$V1
|
165
|
+
}
|
161
166
|
}else{
|
162
167
|
data <- as.matrix(read.table(opt$data_file, sep="\t", header=opt$header, stringsAsFactors=FALSE, row.names= 1, check.names = FALSE))
|
163
168
|
}
|
@@ -187,32 +192,32 @@ if(opt$same_sets){
|
|
187
192
|
hr <- fastcluster::hclust(as.dist(matrix_transf), method="ward.D2")
|
188
193
|
groups <- cluster_obj_to_groups(matrix_transf, hr, opt$tree_cut_method, minProportionCluster = opt$minProportionCluster)
|
189
194
|
|
190
|
-
sim_between_groups <- calc_sim_between_groups(data, groups)
|
191
|
-
distance_between_groups <- 1 - sim_between_groups
|
192
|
-
groups_clustered <- fastcluster::hclust(as.dist(distance_between_groups), method="ward.D2")
|
193
|
-
|
194
|
-
# Plot dendrogram to check performance
|
195
|
-
# png(file=file.path(opt$output, 'dendrogram_groups.png', sep=''))
|
196
|
-
# plot(dendrogram_groups)
|
197
|
-
# dev.off()
|
198
|
-
######### EXPORT
|
199
195
|
write.table(groups, file=paste0(opt$output, '_clusters.txt'), sep="\t", quote=FALSE, col.names=FALSE, row.names= TRUE)
|
200
196
|
if (opt$save_raw_clust){
|
201
197
|
dendrogram_groups <- as.dendrogram(hr)
|
202
|
-
} else {
|
198
|
+
} else { # TODO Pedro: I have no idea about the reason for the following code
|
199
|
+
sim_between_groups <- calc_sim_between_groups(data, groups)
|
200
|
+
distance_between_groups <- 1 - sim_between_groups
|
201
|
+
groups_clustered <- fastcluster::hclust(as.dist(distance_between_groups), method="ward.D2")
|
203
202
|
dendrogram_groups <- as.dendrogram(groups_clustered)
|
204
203
|
}
|
204
|
+
# Plot dendrogram to check performance
|
205
|
+
# png(file=file.path(opt$output, 'dendrogram_groups.png', sep=''))
|
206
|
+
# plot(dendrogram_groups)
|
207
|
+
# dev.off()
|
205
208
|
save(dendrogram_groups, file=paste0(opt$output, '_dendrogram_groups.RData', sep=''))
|
206
209
|
|
207
210
|
}else{
|
208
211
|
# Compute distance matrices for rows and columns
|
209
212
|
mdistRows = toDistances(matrix_transf)
|
210
213
|
mdistCols = toDistances(matrix_transf, FALSE)
|
214
|
+
rownames(mdistRows) <- rownames(matrix_transf) # The square matrixes obtained have lost
|
215
|
+
colnames(mdistRows) <- rownames(matrix_transf) # row and col names. We retrieve them
|
216
|
+
rownames(mdistCols) <- colnames(matrix_transf) # from original matrix. In previous case,
|
217
|
+
colnames(mdistCols) <- colnames(matrix_transf) # the matrix_transf is used directly by hclust
|
211
218
|
# Obtaining clustering
|
212
|
-
quantValue_row <- quantile(mdistRows, c(.2), na.rm = TRUE)
|
213
219
|
hr_row <- fastcluster::hclust(as.dist(mdistRows), method="ward.D2")
|
214
|
-
groups_row <- cluster_obj_to_groups(mdistRows,
|
215
|
-
|
220
|
+
groups_row <- cluster_obj_to_groups(mdistRows, hr_row, opt$tree_cut_method, minProportionCluster = opt$minProportionCluster)
|
216
221
|
quantValue_col <- quantile(mdistCols, c(.2), na.rm = TRUE)
|
217
222
|
hr_col <- fastcluster::hclust(as.dist(mdistCols), method="ward.D2")
|
218
223
|
groups_col <- cutree(hr_col, h = quantValue_col)
|
@@ -227,19 +232,18 @@ if(opt$pdf){
|
|
227
232
|
}else{
|
228
233
|
png(paste0(opt$output, '_heatmap.png'), width = 1000, height = 1000, units = "px", res=175, pointsize = 8)
|
229
234
|
}
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
xlab = opt$collabel, ylab = opt$rowlabel)
|
235
|
+
if(opt$same_sets){
|
236
|
+
group_colours <- colorRampPalette(brewer.pal(8, "Set1"))(length(unique(groups)))
|
237
|
+
group_colours_arranged <- c(rep('#000000', length(groups[groups == 0])), group_colours[groups])
|
238
|
+
heatmap.2(data, Rowv=as.dendrogram(hr), Colv=as.dendrogram(hr), trace="none", col=brewer.pal(11,"RdBu"), dendrogram = c("row"), labRow = FALSE, labCol = FALSE,
|
239
|
+
xlab = opt$collabel, ylab = opt$rowlabel, RowSideColors=group_colours_arranged)
|
240
|
+
}else{
|
241
|
+
group_colours <- colorRampPalette(brewer.pal(8, "Set1"))(length(unique(groups_row)))
|
242
|
+
group_colours_arranged <- c(rep('#000000', length(groups_row[groups_row == 0])), group_colours[groups_row])
|
243
|
+
heatmap.2(data, Rowv=as.dendrogram(hr_row), Colv=as.dendrogram(hr_col), trace="none", col=brewer.pal(11,"RdBu"), dendrogram = c("row"), labRow = FALSE, labCol = FALSE,
|
244
|
+
xlab = opt$collabel, ylab = opt$rowlabel, RowSideColors=group_colours_arranged)
|
241
245
|
|
242
|
-
|
246
|
+
}
|
243
247
|
dev.off()
|
244
248
|
# save.image("test.RData")
|
245
249
|
|