biopipen 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +8 -0
- biopipen/ns/bam.py +0 -2
- biopipen/ns/bed.py +35 -0
- biopipen/ns/cellranger_pipeline.py +5 -5
- biopipen/ns/cnv.py +18 -2
- biopipen/ns/cnvkit_pipeline.py +16 -11
- biopipen/ns/gene.py +68 -23
- biopipen/ns/misc.py +2 -15
- biopipen/ns/plot.py +146 -0
- biopipen/ns/regulation.py +214 -0
- biopipen/ns/scrna.py +15 -3
- biopipen/ns/snp.py +516 -8
- biopipen/ns/stats.py +74 -2
- biopipen/ns/vcf.py +196 -0
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/scripts/bam/CNVpytor.py +144 -46
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMerge.py +1 -1
- biopipen/scripts/cnv/AneuploidyScore.R +30 -7
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
- biopipen/scripts/cnv/TMADScore.R +21 -5
- biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
- biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
- biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
- biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
- biopipen/scripts/gene/GeneNameConversion.R +65 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/plot/Manhattan.R +140 -0
- biopipen/scripts/plot/QQPlot.R +62 -0
- biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
- biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
- biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
- biopipen/scripts/regulation/MotifScan.py +159 -0
- biopipen/scripts/regulation/atSNP.R +33 -0
- biopipen/scripts/regulation/motifBreakR.R +1594 -0
- biopipen/scripts/scrna/CellsDistribution.R +2 -0
- biopipen/scripts/scrna/MarkersFinder.R +59 -67
- biopipen/scripts/scrna/SeuratClustering.R +63 -29
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
- biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
- biopipen/scripts/snp/MatrixEQTL.R +84 -43
- biopipen/scripts/snp/Plink2GTMat.py +133 -0
- biopipen/scripts/snp/PlinkCallRate.R +190 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +298 -0
- biopipen/scripts/snp/PlinkFromVcf.py +78 -0
- biopipen/scripts/snp/PlinkHWE.R +80 -0
- biopipen/scripts/snp/PlinkHet.R +92 -0
- biopipen/scripts/snp/PlinkIBD.R +197 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/MetaPvalue.R +2 -1
- biopipen/scripts/stats/MetaPvalue1.R +70 -0
- biopipen/scripts/tcr/TCRClusterStats.R +12 -7
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/VcfFix_utils.py +1 -1
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/utils/gene.R +83 -37
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.R +56 -0
- biopipen/utils/misc.py +5 -2
- biopipen/utils/reference.py +54 -10
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/RECORD +78 -50
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
- biopipen/ns/bcftools.py +0 -111
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0
|
@@ -360,6 +360,7 @@ do_case <- function(name, case) {
|
|
|
360
360
|
}
|
|
361
361
|
|
|
362
362
|
log_info(" Merging and saving pie charts ...")
|
|
363
|
+
devpars = case$devpars
|
|
363
364
|
# assemble and save pie chart plots
|
|
364
365
|
res <- devpars$res %||% 100
|
|
365
366
|
# legend, cells_by names
|
|
@@ -405,6 +406,7 @@ do_case <- function(name, case) {
|
|
|
405
406
|
}
|
|
406
407
|
|
|
407
408
|
col_fun <- colorRamp2(c(0, max(hmdata, na.rm = T)), c("lightyellow", "purple"))
|
|
409
|
+
hm_devpars <- case$hm_devpars
|
|
408
410
|
hm_res <- hm_devpars$res %||% 100
|
|
409
411
|
hm_width <- hm_devpars$width %||% (600 + 15 * length(unique(meta$seurat_clusters)) + extra_width)
|
|
410
412
|
hm_height <- hm_devpars$height %||% (450 + 15 * cells_rows + extra_height)
|
|
@@ -411,45 +411,11 @@ do_case_findall <- function(casename) {
|
|
|
411
411
|
log_info(" Using cached markers ...")
|
|
412
412
|
markers <- cached$data
|
|
413
413
|
} else {
|
|
414
|
-
markers <-
|
|
415
|
-
do_call(FindAllMarkers, args)
|
|
416
|
-
# gene, p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster
|
|
417
|
-
}, error = function(e) {
|
|
418
|
-
log_warn(e$message)
|
|
419
|
-
|
|
420
|
-
data.frame(
|
|
421
|
-
gene = character(),
|
|
422
|
-
p_val = numeric(),
|
|
423
|
-
avg_log2FC = numeric(),
|
|
424
|
-
pct.1 = numeric(),
|
|
425
|
-
pct.2 = numeric(),
|
|
426
|
-
p_val_adj=numeric(),
|
|
427
|
-
cluster = character()
|
|
428
|
-
)
|
|
429
|
-
})
|
|
414
|
+
markers <- find_markers(args, find_all = TRUE)
|
|
430
415
|
cached$data <- markers
|
|
431
416
|
save_to_cache(cached, "FindAllMarkers", cache)
|
|
432
417
|
}
|
|
433
418
|
|
|
434
|
-
if (nrow(markers) == 0 && defassay == "SCT") {
|
|
435
|
-
log_warn(" No markers found from SCT assay, try recorrect_umi = FALSE")
|
|
436
|
-
args$recorrect_umi <- FALSE
|
|
437
|
-
markers <- tryCatch({
|
|
438
|
-
do_call(FindAllMarkers, args)
|
|
439
|
-
}, error = function(e) {
|
|
440
|
-
log_warn(e$message)
|
|
441
|
-
data.frame(
|
|
442
|
-
gene = character(),
|
|
443
|
-
p_val = numeric(),
|
|
444
|
-
avg_log2FC = numeric(),
|
|
445
|
-
pct.1 = numeric(),
|
|
446
|
-
pct.2 = numeric(),
|
|
447
|
-
p_val_adj=numeric(),
|
|
448
|
-
cluster = character()
|
|
449
|
-
)
|
|
450
|
-
})
|
|
451
|
-
}
|
|
452
|
-
|
|
453
419
|
if (is.null(case$dotplot$assay)) {
|
|
454
420
|
case$dotplot$assay <- case$assay
|
|
455
421
|
}
|
|
@@ -483,6 +449,63 @@ do_case_findall <- function(casename) {
|
|
|
483
449
|
}
|
|
484
450
|
}
|
|
485
451
|
|
|
452
|
+
find_markers <- function(findmarkers_args, find_all = FALSE) {
|
|
453
|
+
if (find_all) {
|
|
454
|
+
fun <- FindAllMarkers
|
|
455
|
+
empty <- data.frame(
|
|
456
|
+
gene = character(),
|
|
457
|
+
p_val = numeric(),
|
|
458
|
+
avg_log2FC = numeric(),
|
|
459
|
+
pct.1 = numeric(),
|
|
460
|
+
pct.2 = numeric(),
|
|
461
|
+
p_val_adj = numeric(),
|
|
462
|
+
cluster = character()
|
|
463
|
+
)
|
|
464
|
+
} else {
|
|
465
|
+
fun <- FindMarkers
|
|
466
|
+
empty <- data.frame(
|
|
467
|
+
gene = character(),
|
|
468
|
+
p_val = numeric(),
|
|
469
|
+
avg_log2FC = numeric(),
|
|
470
|
+
pct.1 = numeric(),
|
|
471
|
+
pct.2 = numeric(),
|
|
472
|
+
p_val_adj = numeric()
|
|
473
|
+
)
|
|
474
|
+
}
|
|
475
|
+
markers <- tryCatch({
|
|
476
|
+
do_call(fun, findmarkers_args) %>% rownames_to_column("gene")
|
|
477
|
+
}, error = function(e) {
|
|
478
|
+
# Object contains multiple models with unequal library sizes.
|
|
479
|
+
# Run `PrepSCTFindMarkers()` before running `FindMarkers()`.
|
|
480
|
+
if (grepl("PrepSCTFindMarkers", e$message)) {
|
|
481
|
+
log_warn(" Running PrepSCTFindMarkers ...")
|
|
482
|
+
findmarkers_args$object <<- PrepSCTFindMarkers(findmarkers_args$object)
|
|
483
|
+
tryCatch({
|
|
484
|
+
do_call(fun, findmarkers_args) %>% rownames_to_column("gene")
|
|
485
|
+
}, error = function(err) {
|
|
486
|
+
log_warn(paste0(" ", err$message))
|
|
487
|
+
empty
|
|
488
|
+
})
|
|
489
|
+
} else {
|
|
490
|
+
log_warn(paste0(" ", e$message))
|
|
491
|
+
empty
|
|
492
|
+
}
|
|
493
|
+
})
|
|
494
|
+
|
|
495
|
+
if (nrow(markers) == 0 && defassay == "SCT") {
|
|
496
|
+
log_warn(" No markers found from SCT assay, trying recorrect_umi = FALSE")
|
|
497
|
+
findmarkers_args$recorrect_umi <- FALSE
|
|
498
|
+
markers <- tryCatch({
|
|
499
|
+
do_call(fun, findmarkers_args) %>% rownames_to_column("gene")
|
|
500
|
+
}, error = function(e) {
|
|
501
|
+
log_warn(paste0(" ", e$message))
|
|
502
|
+
empty
|
|
503
|
+
})
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
markers
|
|
507
|
+
}
|
|
508
|
+
|
|
486
509
|
sections <- c()
|
|
487
510
|
do_case <- function(casename) {
|
|
488
511
|
if (isTRUE(cases[[casename]]$findall)) {
|
|
@@ -538,38 +561,7 @@ do_case <- function(casename) {
|
|
|
538
561
|
# args$min.cells.feature <- args$min.cells.feature %||% 1
|
|
539
562
|
# args$min.pct <- args$min.pct %||% 0
|
|
540
563
|
|
|
541
|
-
markers <-
|
|
542
|
-
do_call(FindMarkers, args) %>% rownames_to_column("gene")
|
|
543
|
-
}, error = function(e) {
|
|
544
|
-
log_warn(paste0(" ", e$message))
|
|
545
|
-
data.frame(
|
|
546
|
-
gene = character(),
|
|
547
|
-
p_val = numeric(),
|
|
548
|
-
avg_log2FC = numeric(),
|
|
549
|
-
pct.1 = numeric(),
|
|
550
|
-
pct.2 = numeric(),
|
|
551
|
-
p_val_adj = numeric()
|
|
552
|
-
)
|
|
553
|
-
})
|
|
554
|
-
|
|
555
|
-
if (nrow(markers) == 0 && defassay == "SCT") {
|
|
556
|
-
log_warn(" No markers found from SCT assay, trying recorrect_umi = FALSE")
|
|
557
|
-
args$recorrect_umi <- FALSE
|
|
558
|
-
markers <- tryCatch({
|
|
559
|
-
do_call(FindMarkers, args) %>% rownames_to_column("gene")
|
|
560
|
-
}, error = function(e) {
|
|
561
|
-
log_warn(paste0(" ", e$message))
|
|
562
|
-
data.frame(
|
|
563
|
-
gene = character(),
|
|
564
|
-
p_val = numeric(),
|
|
565
|
-
avg_log2FC = numeric(),
|
|
566
|
-
pct.1 = numeric(),
|
|
567
|
-
pct.2 = numeric(),
|
|
568
|
-
p_val_adj=numeric()
|
|
569
|
-
)
|
|
570
|
-
})
|
|
571
|
-
}
|
|
572
|
-
|
|
564
|
+
markers <- find_markers(args)
|
|
573
565
|
siggenes <- do_enrich(info, markers, case$sigmarkers, case$volcano_genes)
|
|
574
566
|
|
|
575
567
|
if (length(siggenes) > 0) {
|
|
@@ -3,9 +3,11 @@ source("{{biopipen_dir}}/utils/caching.R")
|
|
|
3
3
|
|
|
4
4
|
library(Seurat)
|
|
5
5
|
library(future)
|
|
6
|
+
library(rlang)
|
|
6
7
|
library(tidyr)
|
|
7
8
|
library(dplyr)
|
|
8
9
|
library(digest)
|
|
10
|
+
library(clustree)
|
|
9
11
|
|
|
10
12
|
set.seed(8525)
|
|
11
13
|
|
|
@@ -129,39 +131,71 @@ if (is.null(cached$data)) {
|
|
|
129
131
|
}
|
|
130
132
|
|
|
131
133
|
envs$FindClusters$random.seed <- envs$FindClusters$random.seed %||% 8525
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
134
|
+
expand_resolution <- function(resolution) {
|
|
135
|
+
expanded_res <- c()
|
|
136
|
+
for (res in resolution) {
|
|
137
|
+
if (is.numeric(res)) {
|
|
138
|
+
expanded_res <- c(expanded_res, res)
|
|
139
|
+
} else {
|
|
140
|
+
# is.character
|
|
141
|
+
parts <- trimws(unlist(strsplit(res, ",")))
|
|
142
|
+
for (part in parts) {
|
|
143
|
+
if (grepl(":", part)) {
|
|
144
|
+
parts <- trimws(unlist(strsplit(part, ":")))
|
|
145
|
+
if (length(parts) == 2) { parts <- c(parts, 0.1) }
|
|
146
|
+
if (length(parts) != 3) {
|
|
147
|
+
stop("Invalid resolution format: {part}. Expected 2 or 3 parts separated by ':' for a range.")
|
|
148
|
+
}
|
|
149
|
+
parts <- as.numeric(parts)
|
|
150
|
+
expanded_res <- c(expanded_res, seq(parts[1], parts[2], by = parts[3]))
|
|
151
|
+
} else {
|
|
152
|
+
expanded_res <- c(expanded_res, as.numeric(part))
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
}
|
|
138
156
|
}
|
|
157
|
+
# keep the last resolution at last
|
|
158
|
+
rev(unique(rev(expanded_res)))
|
|
139
159
|
}
|
|
160
|
+
resolution <- envs$FindClusters$resolution <- expand_resolution(envs$FindClusters$resolution %||% 0.8)
|
|
161
|
+
log_info("Running FindClusters at resolution: {paste(resolution, collapse=',')} ...")
|
|
162
|
+
|
|
163
|
+
envs$FindClusters$object <- sobj
|
|
164
|
+
sobj <- do_call(FindClusters, envs$FindClusters)
|
|
140
165
|
|
|
166
|
+
# recode clusters from 0, 1, 2, ... to c1, c2, c3, ...
|
|
167
|
+
recode_clusters <- function(clusters) {
|
|
168
|
+
recode <- function(x) paste0("c", as.integer(as.character(x)) + 1)
|
|
169
|
+
clusters <- factor(recode(clusters), levels = recode(levels(clusters)))
|
|
170
|
+
clusters
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
graph_name <- envs$FindClusters$graph.name %||% paste0(DefaultAssay(sobj), "_snn_res.")
|
|
141
174
|
for (res in resolution) {
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
175
|
+
cluster_name <- paste0(graph_name, res)
|
|
176
|
+
new_cluster_name <- paste0("seurat_clusters.", res)
|
|
177
|
+
sobj@meta.data[[new_cluster_name]] <- recode_clusters(sobj@meta.data[[cluster_name]])
|
|
178
|
+
}
|
|
179
|
+
sobj@meta.data$seurat_clusters <- recode_clusters(sobj@meta.data$seurat_clusters)
|
|
180
|
+
Idents(sobj) <- "seurat_clusters"
|
|
181
|
+
|
|
182
|
+
ident_table <- table(Idents(sobj))
|
|
183
|
+
log_info("- Found {length(ident_table)} clusters at resolution {resolution[length(resolution)]}")
|
|
184
|
+
print(ident_table)
|
|
185
|
+
cat("\n")
|
|
186
|
+
|
|
187
|
+
# plot the tree
|
|
188
|
+
if (length(resolution) > 1) {
|
|
189
|
+
log_info("Plotting clustree ...")
|
|
190
|
+
png(
|
|
191
|
+
file.path(joboutdir, "clustree.png"),
|
|
192
|
+
res = envs$clustree_devpars$res,
|
|
193
|
+
width = envs$clustree_devpars$width,
|
|
194
|
+
height = envs$clustree_devpars$height
|
|
195
|
+
)
|
|
196
|
+
p <- clustree(sobj, prefix = "seurat_clusters.")
|
|
197
|
+
print(p)
|
|
198
|
+
dev.off()
|
|
165
199
|
}
|
|
166
200
|
|
|
167
201
|
if (DefaultAssay(sobj) == "SCT") {
|
|
@@ -63,6 +63,26 @@ if (endsWith(ref, ".rds") || endsWith(ref, ".RDS")) {
|
|
|
63
63
|
reference = LoadH5Seurat(ref)
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
+
# check if refdata exists in the reference
|
|
67
|
+
for (rname in names(mapquery_args$refdata)) {
|
|
68
|
+
use_name <- mapquery_args$refdata[[rname]]
|
|
69
|
+
# transferring an assay
|
|
70
|
+
if (use_name %in% names(reference)) { next }
|
|
71
|
+
# transferring a metadata column
|
|
72
|
+
if (!use_name %in% colnames(reference@meta.data)) {
|
|
73
|
+
stop(paste0(
|
|
74
|
+
"The reference does not have the column '",
|
|
75
|
+
use_name,
|
|
76
|
+
"' in either assays or metadata. "
|
|
77
|
+
))
|
|
78
|
+
if (startsWith(use_name, "predicted.")) {
|
|
79
|
+
stop(paste0(
|
|
80
|
+
"Do you mean: ", substring(use_name, 11),
|
|
81
|
+
))
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
66
86
|
if (refnorm == "auto" && DefaultAssay(reference) == "SCT") {
|
|
67
87
|
refnorm = "SCTransform"
|
|
68
88
|
}
|
|
@@ -8,6 +8,7 @@ library(tidyr)
|
|
|
8
8
|
library(dplyr)
|
|
9
9
|
library(tidyseurat)
|
|
10
10
|
library(digest)
|
|
11
|
+
library(clustree)
|
|
11
12
|
|
|
12
13
|
set.seed(8525)
|
|
13
14
|
|
|
@@ -28,6 +29,40 @@ plan(strategy = "multicore", workers = envs$ncores)
|
|
|
28
29
|
args
|
|
29
30
|
}
|
|
30
31
|
|
|
32
|
+
.expand_resolution <- function(resolution) {
|
|
33
|
+
expanded_res <- c()
|
|
34
|
+
for (res in resolution) {
|
|
35
|
+
if (is.numeric(res)) {
|
|
36
|
+
expanded_res <- c(expanded_res, res)
|
|
37
|
+
} else {
|
|
38
|
+
# is.character
|
|
39
|
+
parts <- trimws(unlist(strsplit(res, ",")))
|
|
40
|
+
for (part in parts) {
|
|
41
|
+
if (grepl(":", part)) {
|
|
42
|
+
parts <- trimws(unlist(strsplit(part, ":")))
|
|
43
|
+
if (length(parts) == 2) { parts <- c(parts, 0.1) }
|
|
44
|
+
if (length(parts) != 3) {
|
|
45
|
+
stop("Invalid resolution format: {part}. Expected 2 or 3 parts separated by ':' for a range.")
|
|
46
|
+
}
|
|
47
|
+
parts <- as.numeric(parts)
|
|
48
|
+
expanded_res <- c(expanded_res, seq(parts[1], parts[2], by = parts[3]))
|
|
49
|
+
} else {
|
|
50
|
+
expanded_res <- c(expanded_res, as.numeric(part))
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
# keep the last resolution at last
|
|
56
|
+
rev(unique(rev(expanded_res)))
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
# recode clusters from 0, 1, 2, ... to s1, s2, s3, ...
|
|
60
|
+
.recode_clusters <- function(clusters) {
|
|
61
|
+
recode <- function(x) paste0("s", as.integer(as.character(x)) + 1)
|
|
62
|
+
clusters <- factor(recode(clusters), levels = recode(levels(clusters)))
|
|
63
|
+
clusters
|
|
64
|
+
}
|
|
65
|
+
|
|
31
66
|
envs$RunUMAP <- .expand_dims(envs$RunUMAP)
|
|
32
67
|
envs$FindNeighbors <- .expand_dims(envs$FindNeighbors)
|
|
33
68
|
|
|
@@ -63,7 +98,8 @@ for (key in names(envs$cases)) {
|
|
|
63
98
|
subset = envs$subset,
|
|
64
99
|
RunUMAP = envs$RunUMAP,
|
|
65
100
|
FindNeighbors = envs$FindNeighbors,
|
|
66
|
-
FindClusters = envs$FindClusters
|
|
101
|
+
FindClusters = envs$FindClusters,
|
|
102
|
+
clustree_devpars = envs$clustree_devpars
|
|
67
103
|
),
|
|
68
104
|
case
|
|
69
105
|
)
|
|
@@ -132,36 +168,49 @@ for (key in names(envs$cases)) {
|
|
|
132
168
|
}
|
|
133
169
|
|
|
134
170
|
case$FindClusters$random.seed <- case$FindClusters$random.seed %||% 8525
|
|
135
|
-
resolution <- case$FindClusters$resolution %||% 0.8
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
171
|
+
resolution <- case$FindClusters$resolution <- .expand_resolution(case$FindClusters$resolution %||% 0.8)
|
|
172
|
+
cached <- get_cached(case$FindClusters, "FindClusters", cache_dir)
|
|
173
|
+
if (is.null(cached$data)) {
|
|
174
|
+
log_info("- Running FindClusters at resolution: {paste(resolution, collapse = ',')} ...")
|
|
175
|
+
case$FindClusters$object <- sobj
|
|
176
|
+
# avoid overwriting the previous clustering results (as they have the same graph name
|
|
177
|
+
sobj1 <- do_call(FindClusters, case$FindClusters)
|
|
178
|
+
graph_name <- case$FindClusters$graph.name %||% paste0(DefaultAssay(sobj), "_snn_res.")
|
|
179
|
+
for (res in resolution) {
|
|
180
|
+
cluster_name <- paste0(graph_name, res)
|
|
181
|
+
new_cluster_name <- paste0(key, ".", res)
|
|
182
|
+
sobj1@meta.data[[new_cluster_name]] <- .recode_clusters(sobj1@meta.data[[cluster_name]])
|
|
141
183
|
}
|
|
184
|
+
sobj1@meta.data[[key]] <- .recode_clusters(sobj1@meta.data$seurat_clusters)
|
|
185
|
+
keys <- sapply(resolution, function(res) paste0(key, ".", res))
|
|
186
|
+
keys <- c(keys, key)
|
|
187
|
+
cached$data <- sobj1@meta.data[, keys, drop = FALSE]
|
|
188
|
+
save_to_cache(cached, "FindClusters", cache_dir)
|
|
189
|
+
rm(sobj1)
|
|
190
|
+
} else {
|
|
191
|
+
log_info("- Using cached FindClusters at resolution: {paste(resolution, collapse = ',')} ...")
|
|
142
192
|
}
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
print(ident_table)
|
|
161
|
-
cat("\n")
|
|
193
|
+
|
|
194
|
+
ident_table <- table(cached$data[[key]])
|
|
195
|
+
log_info(" Found {length(ident_table)} clusters")
|
|
196
|
+
print(ident_table)
|
|
197
|
+
cat("\n")
|
|
198
|
+
|
|
199
|
+
if (length(resolution) > 1) {
|
|
200
|
+
log_info("- Plotting clustree ...")
|
|
201
|
+
png(
|
|
202
|
+
file.path(joboutdir, paste0(key, ".clustree.png")),
|
|
203
|
+
res = case$clustree_devpars$res,
|
|
204
|
+
width = case$clustree_devpars$width,
|
|
205
|
+
height = case$clustree_devpars$height
|
|
206
|
+
)
|
|
207
|
+
p <- clustree(cached$data, prefix = paste0(key, "."))
|
|
208
|
+
print(p)
|
|
209
|
+
dev.off()
|
|
162
210
|
}
|
|
211
|
+
|
|
163
212
|
log_info("- Updating meta.data with subclusters...")
|
|
164
|
-
srtobj <- AddMetaData(srtobj, metadata = cached$data
|
|
213
|
+
srtobj <- AddMetaData(srtobj, metadata = cached$data)
|
|
165
214
|
srtobj[[paste0("sub_umap_", key)]] <- reduc
|
|
166
215
|
}
|
|
167
216
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
2
|
library(rlang)
|
|
3
|
+
library(rtracklayer)
|
|
3
4
|
library(MatrixEQTL)
|
|
4
5
|
|
|
5
6
|
snpfile = {{in.geno | r}}
|
|
@@ -11,6 +12,7 @@ outfile = {{out.cisqtls | r}}
|
|
|
11
12
|
|
|
12
13
|
model = {{envs.model | r}}
|
|
13
14
|
pval = {{envs.pval | r}}
|
|
15
|
+
match_samples = {{envs.match_samples | r}}
|
|
14
16
|
transp = {{envs.transp | r}}
|
|
15
17
|
fdr = {{envs.fdr | r}}
|
|
16
18
|
snppos = {{envs.snppos | r}}
|
|
@@ -36,7 +38,9 @@ if (!trans_enabled && !cis_enabled) {
|
|
|
36
38
|
transp <- 1e-5
|
|
37
39
|
}
|
|
38
40
|
|
|
39
|
-
transpose_file <- function(file) {
|
|
41
|
+
transpose_file <- function(file, what) {
|
|
42
|
+
if (is.null(file)) return(NULL)
|
|
43
|
+
log_info("Transposing {what} file ...")
|
|
40
44
|
out <- file.path(joboutdir, paste0(
|
|
41
45
|
tools::file_path_sans_ext(basename(file)),
|
|
42
46
|
".transposed.",
|
|
@@ -47,10 +51,11 @@ transpose_file <- function(file) {
|
|
|
47
51
|
out
|
|
48
52
|
}
|
|
49
53
|
|
|
50
|
-
if (transpose_geno) snpfile = transpose_file(snpfile)
|
|
51
|
-
if (transpose_expr) expfile = transpose_file(expfile)
|
|
52
|
-
if (transpose_cov) covfile = transpose_file(covfile)
|
|
54
|
+
if (transpose_geno) snpfile = transpose_file(snpfile, "geno")
|
|
55
|
+
if (transpose_expr) expfile = transpose_file(expfile, "expr")
|
|
56
|
+
if (transpose_cov) covfile = transpose_file(covfile, "cov")
|
|
53
57
|
|
|
58
|
+
log_info("Loading SNP data ...")
|
|
54
59
|
snps = SlicedData$new();
|
|
55
60
|
snps$fileDelimiter = "\t"; # the TAB character
|
|
56
61
|
snps$fileOmitCharacters = "NA"; # denote missing values;
|
|
@@ -59,6 +64,7 @@ snps$fileSkipColumns = 1; # one column of row labels
|
|
|
59
64
|
snps$fileSliceSize = 10000; # read file in pieces of 2,000 rows
|
|
60
65
|
snps$LoadFile( snpfile );
|
|
61
66
|
|
|
67
|
+
log_info("Loading gene expression data ...")
|
|
62
68
|
gene = SlicedData$new();
|
|
63
69
|
gene$fileDelimiter = "\t"; # the TAB character
|
|
64
70
|
gene$fileOmitCharacters = "NA"; # denote missing values;
|
|
@@ -69,16 +75,39 @@ gene$LoadFile( expfile );
|
|
|
69
75
|
|
|
70
76
|
cvrt = SlicedData$new();
|
|
71
77
|
if (!is.null(covfile) && file.exists(covfile)) {
|
|
72
|
-
|
|
78
|
+
log_info("Loading covariate data ...")
|
|
79
|
+
covmatrix = read.table(covfile, header=TRUE, stringsAsFactors=FALSE, row.names=1, sep="\t", quote="", check.names=FALSE)
|
|
73
80
|
cvrt$CreateFromMatrix( as.matrix(covmatrix) )
|
|
74
81
|
}
|
|
75
82
|
|
|
83
|
+
log_info("Matching samples ...")
|
|
84
|
+
if (match_samples) {
|
|
85
|
+
# let matrixEQTL raise an error if samples do not match
|
|
86
|
+
} else {
|
|
87
|
+
n_sample_snps = snps$nCols()
|
|
88
|
+
n_sample_gene = gene$nCols()
|
|
89
|
+
common_samples = intersect(snps$columnNames, gene$columnNames)
|
|
90
|
+
if (!is.null(covfile)) {
|
|
91
|
+
common_samples = intersect(common_samples, cvrt$columnNames)
|
|
92
|
+
n_sample_cov = cvrt$nCols()
|
|
93
|
+
cvrt = cvrt$ColumnSubsample(match(common_samples, cvrt$columnNames))
|
|
94
|
+
}
|
|
95
|
+
snps = snps$ColumnSubsample(match(common_samples, snps$columnNames))
|
|
96
|
+
gene = gene$ColumnSubsample(match(common_samples, gene$columnNames))
|
|
97
|
+
log_info("- Samples used in SNP data: {n_sample_snps} -> {snps$nCols()}")
|
|
98
|
+
log_info("- Samples used in gene expression data: {n_sample_gene} -> {gene$nCols()}")
|
|
99
|
+
if (!is.null(covfile)) {
|
|
100
|
+
log_info("- Samples used in covariate data: {n_sample_cov} -> {cvrt$nCols()}")
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
log_info("Composing engine parameters ...")
|
|
76
105
|
engine_params = list()
|
|
77
106
|
engine_params$snps = snps
|
|
78
107
|
engine_params$gene = gene
|
|
79
108
|
engine_params$cvrt = cvrt
|
|
80
|
-
engine_params$output_file_name =
|
|
81
|
-
engine_params$pvOutputThreshold =
|
|
109
|
+
engine_params$output_file_name = if(trans_enabled) alleqtl else NULL
|
|
110
|
+
engine_params$pvOutputThreshold = if(trans_enabled) transp else 0
|
|
82
111
|
engine_params$useModel = model
|
|
83
112
|
engine_params$errorCovariance = numeric()
|
|
84
113
|
engine_params$verbose = TRUE
|
|
@@ -89,66 +118,78 @@ noq = function(s) {
|
|
|
89
118
|
}
|
|
90
119
|
|
|
91
120
|
if (cis_enabled) {
|
|
121
|
+
log_info("Loading SNP positions ...")
|
|
92
122
|
if (endsWith(snppos, ".bed")) {
|
|
93
|
-
snppos_data = read.table
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
123
|
+
snppos_data = read.table(snppos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
|
|
124
|
+
snppos_data = data.frame(
|
|
125
|
+
snp = snppos_data$V4,
|
|
126
|
+
chr = snppos_data$V1,
|
|
127
|
+
pos = snppos_data$V3
|
|
128
|
+
)
|
|
97
129
|
} else if (endsWith(snppos, ".gff") || endsWith(snppos, ".gtf")) {
|
|
98
|
-
snppos_data =
|
|
99
|
-
|
|
100
|
-
snppos_data =
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
return(noq(substring(s, 8)))
|
|
106
|
-
else if (startsWith(s, "rs_id "))
|
|
107
|
-
return(noq(substring(s, 7)))
|
|
108
|
-
else if (startsWith(s, "rs "))
|
|
109
|
-
return(noq(substring(s, 4)))
|
|
110
|
-
}
|
|
111
|
-
}))
|
|
130
|
+
snppos_data = import(snppos)
|
|
131
|
+
elem_meta = elementMetadata(snppos_data)
|
|
132
|
+
snppos_data = data.frame(
|
|
133
|
+
snp = elem_meta$snp_id %||% elem_meta$rs_id %||% elem_meta$rs,
|
|
134
|
+
chr = as.character(seqnames(snppos_data)),
|
|
135
|
+
pos = start(snppos_data)
|
|
136
|
+
)
|
|
112
137
|
} else if (endsWith(snppos, ".vcf") || endsWith(snppos, ".vcf.gz")) {
|
|
113
|
-
snppos_data = read.table
|
|
114
|
-
|
|
138
|
+
snppos_data = read.table(
|
|
139
|
+
snppos,
|
|
140
|
+
header=FALSE,
|
|
141
|
+
row.names=NULL,
|
|
142
|
+
stringsAsFactors=FALSE,
|
|
143
|
+
check.names=FALSE
|
|
144
|
+
)
|
|
115
145
|
snppos_data = snppos_data[, c(3, 1, 2)]
|
|
116
146
|
colnames(snppos_data) = c("snp", "chr", "pos")
|
|
117
147
|
} else {
|
|
118
|
-
snppos_data = read.table
|
|
148
|
+
snppos_data = read.table(
|
|
149
|
+
snppos,
|
|
150
|
+
header=FALSE,
|
|
151
|
+
row.names=NULL,
|
|
152
|
+
stringsAsFactors=FALSE,
|
|
153
|
+
check.names=FALSE
|
|
154
|
+
)
|
|
119
155
|
colnames(snppos_data) = c("snp", "chr", "pos")
|
|
120
156
|
}
|
|
121
157
|
|
|
158
|
+
log_info("Loading gene positions ...")
|
|
122
159
|
if (endsWith(genepos, ".bed")) {
|
|
123
|
-
genepos_data = read.table
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
160
|
+
genepos_data = read.table(genepos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
|
|
161
|
+
genepos_data = data.frame(
|
|
162
|
+
geneid = genepos_data$V4,
|
|
163
|
+
chr = genepos_data$V1,
|
|
164
|
+
s1 = genepos_data$V2,
|
|
165
|
+
s2 = genepos_data$V3
|
|
166
|
+
)
|
|
127
167
|
} else if (endsWith(genepos, ".gff") || endsWith(genepos, ".gtf")) {
|
|
128
|
-
genepos_data =
|
|
129
|
-
|
|
130
|
-
genepos_data =
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
}
|
|
137
|
-
})))
|
|
168
|
+
genepos_data = import(genepos)
|
|
169
|
+
elem_meta = elementMetadata(genepos_data)
|
|
170
|
+
genepos_data = data.frame(
|
|
171
|
+
geneid = elem_meta$gene_id %||% elem_meta$gene_name,
|
|
172
|
+
chr = as.character(seqnames(genepos_data)),
|
|
173
|
+
s1 = start(genepos_data),
|
|
174
|
+
s2 = end(genepos_data)
|
|
175
|
+
)
|
|
138
176
|
} else {
|
|
139
177
|
genepos_data = read.table(genepos, header = TRUE, stringsAsFactors = FALSE);
|
|
140
178
|
colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
|
|
141
179
|
}
|
|
142
180
|
|
|
181
|
+
log_info("Running MatrixEQTL with cis-eQTLs enabled ...")
|
|
143
182
|
engine_params$output_file_name.cis = outfile
|
|
144
183
|
engine_params$pvOutputThreshold.cis = pval
|
|
145
184
|
engine_params$cisDist = dist
|
|
146
185
|
engine_params$snpspos = snppos_data
|
|
147
186
|
engine_params$genepos = genepos_data
|
|
148
187
|
do_call(Matrix_eQTL_main, engine_params)
|
|
188
|
+
if (!file.exists(alleqtl)) file.create(alleqtl)
|
|
149
189
|
} else {
|
|
190
|
+
log_info("Running MatrixEQTL without cis-eQTLs ...")
|
|
150
191
|
do_call(Matrix_eQTL_engine, engine_params)
|
|
151
|
-
file.create(outfile)
|
|
192
|
+
if (!file.exists(outfile)) file.create(outfile)
|
|
152
193
|
}
|
|
153
194
|
|
|
154
195
|
if (pval == 0) {
|