biopipen 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (83) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +146 -0
  11. biopipen/ns/regulation.py +214 -0
  12. biopipen/ns/scrna.py +15 -3
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +74 -2
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  36. biopipen/scripts/gene/GenePromoters.R +61 -0
  37. biopipen/scripts/misc/Shell.sh +15 -0
  38. biopipen/scripts/plot/Manhattan.R +140 -0
  39. biopipen/scripts/plot/QQPlot.R +62 -0
  40. biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
  41. biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
  42. biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
  43. biopipen/scripts/regulation/MotifScan.py +159 -0
  44. biopipen/scripts/regulation/atSNP.R +33 -0
  45. biopipen/scripts/regulation/motifBreakR.R +1594 -0
  46. biopipen/scripts/scrna/CellsDistribution.R +2 -0
  47. biopipen/scripts/scrna/MarkersFinder.R +59 -67
  48. biopipen/scripts/scrna/SeuratClustering.R +63 -29
  49. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  50. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  51. biopipen/scripts/snp/MatrixEQTL.R +84 -43
  52. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  53. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  54. biopipen/scripts/snp/PlinkFilter.py +100 -0
  55. biopipen/scripts/snp/PlinkFreq.R +298 -0
  56. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  57. biopipen/scripts/snp/PlinkHWE.R +80 -0
  58. biopipen/scripts/snp/PlinkHet.R +92 -0
  59. biopipen/scripts/snp/PlinkIBD.R +197 -0
  60. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  61. biopipen/scripts/stats/MetaPvalue.R +2 -1
  62. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  63. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  64. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  65. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  66. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  67. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  68. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  69. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  70. biopipen/utils/gene.R +83 -37
  71. biopipen/utils/gene.py +108 -60
  72. biopipen/utils/misc.R +56 -0
  73. biopipen/utils/misc.py +5 -2
  74. biopipen/utils/reference.py +54 -10
  75. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
  76. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/RECORD +78 -50
  77. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
  78. biopipen/ns/bcftools.py +0 -111
  79. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  80. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  81. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  82. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  83. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0
@@ -360,6 +360,7 @@ do_case <- function(name, case) {
360
360
  }
361
361
 
362
362
  log_info(" Merging and saving pie charts ...")
363
+ devpars = case$devpars
363
364
  # assemble and save pie chart plots
364
365
  res <- devpars$res %||% 100
365
366
  # legend, cells_by names
@@ -405,6 +406,7 @@ do_case <- function(name, case) {
405
406
  }
406
407
 
407
408
  col_fun <- colorRamp2(c(0, max(hmdata, na.rm = T)), c("lightyellow", "purple"))
409
+ hm_devpars <- case$hm_devpars
408
410
  hm_res <- hm_devpars$res %||% 100
409
411
  hm_width <- hm_devpars$width %||% (600 + 15 * length(unique(meta$seurat_clusters)) + extra_width)
410
412
  hm_height <- hm_devpars$height %||% (450 + 15 * cells_rows + extra_height)
@@ -411,45 +411,11 @@ do_case_findall <- function(casename) {
411
411
  log_info(" Using cached markers ...")
412
412
  markers <- cached$data
413
413
  } else {
414
- markers <- tryCatch({
415
- do_call(FindAllMarkers, args)
416
- # gene, p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster
417
- }, error = function(e) {
418
- log_warn(e$message)
419
-
420
- data.frame(
421
- gene = character(),
422
- p_val = numeric(),
423
- avg_log2FC = numeric(),
424
- pct.1 = numeric(),
425
- pct.2 = numeric(),
426
- p_val_adj=numeric(),
427
- cluster = character()
428
- )
429
- })
414
+ markers <- find_markers(args, find_all = TRUE)
430
415
  cached$data <- markers
431
416
  save_to_cache(cached, "FindAllMarkers", cache)
432
417
  }
433
418
 
434
- if (nrow(markers) == 0 && defassay == "SCT") {
435
- log_warn(" No markers found from SCT assay, try recorrect_umi = FALSE")
436
- args$recorrect_umi <- FALSE
437
- markers <- tryCatch({
438
- do_call(FindAllMarkers, args)
439
- }, error = function(e) {
440
- log_warn(e$message)
441
- data.frame(
442
- gene = character(),
443
- p_val = numeric(),
444
- avg_log2FC = numeric(),
445
- pct.1 = numeric(),
446
- pct.2 = numeric(),
447
- p_val_adj=numeric(),
448
- cluster = character()
449
- )
450
- })
451
- }
452
-
453
419
  if (is.null(case$dotplot$assay)) {
454
420
  case$dotplot$assay <- case$assay
455
421
  }
@@ -483,6 +449,63 @@ do_case_findall <- function(casename) {
483
449
  }
484
450
  }
485
451
 
452
+ find_markers <- function(findmarkers_args, find_all = FALSE) {
453
+ if (find_all) {
454
+ fun <- FindAllMarkers
455
+ empty <- data.frame(
456
+ gene = character(),
457
+ p_val = numeric(),
458
+ avg_log2FC = numeric(),
459
+ pct.1 = numeric(),
460
+ pct.2 = numeric(),
461
+ p_val_adj = numeric(),
462
+ cluster = character()
463
+ )
464
+ } else {
465
+ fun <- FindMarkers
466
+ empty <- data.frame(
467
+ gene = character(),
468
+ p_val = numeric(),
469
+ avg_log2FC = numeric(),
470
+ pct.1 = numeric(),
471
+ pct.2 = numeric(),
472
+ p_val_adj = numeric()
473
+ )
474
+ }
475
+ markers <- tryCatch({
476
+ do_call(fun, findmarkers_args) %>% rownames_to_column("gene")
477
+ }, error = function(e) {
478
+ # Object contains multiple models with unequal library sizes.
479
+ # Run `PrepSCTFindMarkers()` before running `FindMarkers()`.
480
+ if (grepl("PrepSCTFindMarkers", e$message)) {
481
+ log_warn(" Running PrepSCTFindMarkers ...")
482
+ findmarkers_args$object <<- PrepSCTFindMarkers(findmarkers_args$object)
483
+ tryCatch({
484
+ do_call(fun, findmarkers_args) %>% rownames_to_column("gene")
485
+ }, error = function(err) {
486
+ log_warn(paste0(" ", err$message))
487
+ empty
488
+ })
489
+ } else {
490
+ log_warn(paste0(" ", e$message))
491
+ empty
492
+ }
493
+ })
494
+
495
+ if (nrow(markers) == 0 && defassay == "SCT") {
496
+ log_warn(" No markers found from SCT assay, trying recorrect_umi = FALSE")
497
+ findmarkers_args$recorrect_umi <- FALSE
498
+ markers <- tryCatch({
499
+ do_call(fun, findmarkers_args) %>% rownames_to_column("gene")
500
+ }, error = function(e) {
501
+ log_warn(paste0(" ", e$message))
502
+ empty
503
+ })
504
+ }
505
+
506
+ markers
507
+ }
508
+
486
509
  sections <- c()
487
510
  do_case <- function(casename) {
488
511
  if (isTRUE(cases[[casename]]$findall)) {
@@ -538,38 +561,7 @@ do_case <- function(casename) {
538
561
  # args$min.cells.feature <- args$min.cells.feature %||% 1
539
562
  # args$min.pct <- args$min.pct %||% 0
540
563
 
541
- markers <- tryCatch({
542
- do_call(FindMarkers, args) %>% rownames_to_column("gene")
543
- }, error = function(e) {
544
- log_warn(paste0(" ", e$message))
545
- data.frame(
546
- gene = character(),
547
- p_val = numeric(),
548
- avg_log2FC = numeric(),
549
- pct.1 = numeric(),
550
- pct.2 = numeric(),
551
- p_val_adj = numeric()
552
- )
553
- })
554
-
555
- if (nrow(markers) == 0 && defassay == "SCT") {
556
- log_warn(" No markers found from SCT assay, trying recorrect_umi = FALSE")
557
- args$recorrect_umi <- FALSE
558
- markers <- tryCatch({
559
- do_call(FindMarkers, args) %>% rownames_to_column("gene")
560
- }, error = function(e) {
561
- log_warn(paste0(" ", e$message))
562
- data.frame(
563
- gene = character(),
564
- p_val = numeric(),
565
- avg_log2FC = numeric(),
566
- pct.1 = numeric(),
567
- pct.2 = numeric(),
568
- p_val_adj=numeric()
569
- )
570
- })
571
- }
572
-
564
+ markers <- find_markers(args)
573
565
  siggenes <- do_enrich(info, markers, case$sigmarkers, case$volcano_genes)
574
566
 
575
567
  if (length(siggenes) > 0) {
@@ -3,9 +3,11 @@ source("{{biopipen_dir}}/utils/caching.R")
3
3
 
4
4
  library(Seurat)
5
5
  library(future)
6
+ library(rlang)
6
7
  library(tidyr)
7
8
  library(dplyr)
8
9
  library(digest)
10
+ library(clustree)
9
11
 
10
12
  set.seed(8525)
11
13
 
@@ -129,39 +131,71 @@ if (is.null(cached$data)) {
129
131
  }
130
132
 
131
133
  envs$FindClusters$random.seed <- envs$FindClusters$random.seed %||% 8525
132
- resolution <- envs$FindClusters$resolution %||% 0.8
133
- if (is.character(resolution)) {
134
- if (grepl(",", resolution)) {
135
- resolution <- as.numeric(trimws(unlist(strsplit(resolution, ","))))
136
- } else {
137
- resolution <- as.numeric(resolution)
134
+ expand_resolution <- function(resolution) {
135
+ expanded_res <- c()
136
+ for (res in resolution) {
137
+ if (is.numeric(res)) {
138
+ expanded_res <- c(expanded_res, res)
139
+ } else {
140
+ # is.character
141
+ parts <- trimws(unlist(strsplit(res, ",")))
142
+ for (part in parts) {
143
+ if (grepl(":", part)) {
144
+ parts <- trimws(unlist(strsplit(part, ":")))
145
+ if (length(parts) == 2) { parts <- c(parts, 0.1) }
146
+ if (length(parts) != 3) {
147
+ stop("Invalid resolution format: {part}. Expected 2 or 3 parts separated by ':' for a range.")
148
+ }
149
+ parts <- as.numeric(parts)
150
+ expanded_res <- c(expanded_res, seq(parts[1], parts[2], by = parts[3]))
151
+ } else {
152
+ expanded_res <- c(expanded_res, as.numeric(part))
153
+ }
154
+ }
155
+ }
138
156
  }
157
+ # keep the last resolution at last
158
+ rev(unique(rev(expanded_res)))
139
159
  }
160
+ resolution <- envs$FindClusters$resolution <- expand_resolution(envs$FindClusters$resolution %||% 0.8)
161
+ log_info("Running FindClusters at resolution: {paste(resolution, collapse=',')} ...")
162
+
163
+ envs$FindClusters$object <- sobj
164
+ sobj <- do_call(FindClusters, envs$FindClusters)
140
165
 
166
+ # recode clusters from 0, 1, 2, ... to c1, c2, c3, ...
167
+ recode_clusters <- function(clusters) {
168
+ recode <- function(x) paste0("c", as.integer(as.character(x)) + 1)
169
+ clusters <- factor(recode(clusters), levels = recode(levels(clusters)))
170
+ clusters
171
+ }
172
+
173
+ graph_name <- envs$FindClusters$graph.name %||% paste0(DefaultAssay(sobj), "_snn_res.")
141
174
  for (res in resolution) {
142
- envs$FindClusters$resolution <- res
143
- cached <- get_cached(envs$FindClusters, paste0("FindClusters_", res), cache_dir)
144
- res_key <- paste0("seurat_clusters_", res)
145
- if (is.null(cached$data)) {
146
- log_info("Running FindClusters at resolution: {res} ...")
147
- envs$FindClusters$object <- sobj
148
- sobj <- do_call(FindClusters, envs$FindClusters)
149
- levels(sobj$seurat_clusters) <- paste0("c", as.numeric(levels(sobj$seurat_clusters)) + 1)
150
- sobj[[res_key]] <- sobj$seurat_clusters
151
- Idents(sobj) <- "seurat_clusters"
152
- cached$data <- list(clusters = sobj$seurat_clusters, commands = sobj@commands)
153
- save_to_cache(cached, paste0("FindClusters_", res), cache_dir)
154
- } else {
155
- log_info("Loading cached FindClusters at resolution: {res} ...")
156
- sobj@commands <- cached$data$commands
157
- sobj[[res_key]] <- cached$data$clusters
158
- sobj$seurat_clusters <- cached$data$clusters
159
- Idents(sobj) <- "seurat_clusters"
160
- }
161
- ident_table <- table(Idents(sobj))
162
- log_info("- Found {length(ident_table)} clusters")
163
- print(ident_table)
164
- cat("\n")
175
+ cluster_name <- paste0(graph_name, res)
176
+ new_cluster_name <- paste0("seurat_clusters.", res)
177
+ sobj@meta.data[[new_cluster_name]] <- recode_clusters(sobj@meta.data[[cluster_name]])
178
+ }
179
+ sobj@meta.data$seurat_clusters <- recode_clusters(sobj@meta.data$seurat_clusters)
180
+ Idents(sobj) <- "seurat_clusters"
181
+
182
+ ident_table <- table(Idents(sobj))
183
+ log_info("- Found {length(ident_table)} clusters at resolution {resolution[length(resolution)]}")
184
+ print(ident_table)
185
+ cat("\n")
186
+
187
+ # plot the tree
188
+ if (length(resolution) > 1) {
189
+ log_info("Plotting clustree ...")
190
+ png(
191
+ file.path(joboutdir, "clustree.png"),
192
+ res = envs$clustree_devpars$res,
193
+ width = envs$clustree_devpars$width,
194
+ height = envs$clustree_devpars$height
195
+ )
196
+ p <- clustree(sobj, prefix = "seurat_clusters.")
197
+ print(p)
198
+ dev.off()
165
199
  }
166
200
 
167
201
  if (DefaultAssay(sobj) == "SCT") {
@@ -63,6 +63,26 @@ if (endsWith(ref, ".rds") || endsWith(ref, ".RDS")) {
63
63
  reference = LoadH5Seurat(ref)
64
64
  }
65
65
 
66
+ # check if refdata exists in the reference
67
+ for (rname in names(mapquery_args$refdata)) {
68
+ use_name <- mapquery_args$refdata[[rname]]
69
+ # transferring an assay
70
+ if (use_name %in% names(reference)) { next }
71
+ # transferring a metadata column
72
+ if (!use_name %in% colnames(reference@meta.data)) {
73
+ stop(paste0(
74
+ "The reference does not have the column '",
75
+ use_name,
76
+ "' in either assays or metadata. "
77
+ ))
78
+ if (startsWith(use_name, "predicted.")) {
79
+ stop(paste0(
80
+ "Do you mean: ", substring(use_name, 11),
81
+ ))
82
+ }
83
+ }
84
+ }
85
+
66
86
  if (refnorm == "auto" && DefaultAssay(reference) == "SCT") {
67
87
  refnorm = "SCTransform"
68
88
  }
@@ -8,6 +8,7 @@ library(tidyr)
8
8
  library(dplyr)
9
9
  library(tidyseurat)
10
10
  library(digest)
11
+ library(clustree)
11
12
 
12
13
  set.seed(8525)
13
14
 
@@ -28,6 +29,40 @@ plan(strategy = "multicore", workers = envs$ncores)
28
29
  args
29
30
  }
30
31
 
32
+ .expand_resolution <- function(resolution) {
33
+ expanded_res <- c()
34
+ for (res in resolution) {
35
+ if (is.numeric(res)) {
36
+ expanded_res <- c(expanded_res, res)
37
+ } else {
38
+ # is.character
39
+ parts <- trimws(unlist(strsplit(res, ",")))
40
+ for (part in parts) {
41
+ if (grepl(":", part)) {
42
+ parts <- trimws(unlist(strsplit(part, ":")))
43
+ if (length(parts) == 2) { parts <- c(parts, 0.1) }
44
+ if (length(parts) != 3) {
45
+ stop("Invalid resolution format: {part}. Expected 2 or 3 parts separated by ':' for a range.")
46
+ }
47
+ parts <- as.numeric(parts)
48
+ expanded_res <- c(expanded_res, seq(parts[1], parts[2], by = parts[3]))
49
+ } else {
50
+ expanded_res <- c(expanded_res, as.numeric(part))
51
+ }
52
+ }
53
+ }
54
+ }
55
+ # keep the last resolution at last
56
+ rev(unique(rev(expanded_res)))
57
+ }
58
+
59
+ # recode clusters from 0, 1, 2, ... to s1, s2, s3, ...
60
+ .recode_clusters <- function(clusters) {
61
+ recode <- function(x) paste0("s", as.integer(as.character(x)) + 1)
62
+ clusters <- factor(recode(clusters), levels = recode(levels(clusters)))
63
+ clusters
64
+ }
65
+
31
66
  envs$RunUMAP <- .expand_dims(envs$RunUMAP)
32
67
  envs$FindNeighbors <- .expand_dims(envs$FindNeighbors)
33
68
 
@@ -63,7 +98,8 @@ for (key in names(envs$cases)) {
63
98
  subset = envs$subset,
64
99
  RunUMAP = envs$RunUMAP,
65
100
  FindNeighbors = envs$FindNeighbors,
66
- FindClusters = envs$FindClusters
101
+ FindClusters = envs$FindClusters,
102
+ clustree_devpars = envs$clustree_devpars
67
103
  ),
68
104
  case
69
105
  )
@@ -132,36 +168,49 @@ for (key in names(envs$cases)) {
132
168
  }
133
169
 
134
170
  case$FindClusters$random.seed <- case$FindClusters$random.seed %||% 8525
135
- resolution <- case$FindClusters$resolution %||% 0.8
136
- if (is.character(resolution)) {
137
- if (grepl(",", resolution)) {
138
- resolution <- as.numeric(trimws(unlist(strsplit(resolution, ","))))
139
- } else {
140
- resolution <- as.numeric(resolution)
171
+ resolution <- case$FindClusters$resolution <- .expand_resolution(case$FindClusters$resolution %||% 0.8)
172
+ cached <- get_cached(case$FindClusters, "FindClusters", cache_dir)
173
+ if (is.null(cached$data)) {
174
+ log_info("- Running FindClusters at resolution: {paste(resolution, collapse = ',')} ...")
175
+ case$FindClusters$object <- sobj
176
+ # avoid overwriting the previous clustering results (as they have the same graph name
177
+ sobj1 <- do_call(FindClusters, case$FindClusters)
178
+ graph_name <- case$FindClusters$graph.name %||% paste0(DefaultAssay(sobj), "_snn_res.")
179
+ for (res in resolution) {
180
+ cluster_name <- paste0(graph_name, res)
181
+ new_cluster_name <- paste0(key, ".", res)
182
+ sobj1@meta.data[[new_cluster_name]] <- .recode_clusters(sobj1@meta.data[[cluster_name]])
141
183
  }
184
+ sobj1@meta.data[[key]] <- .recode_clusters(sobj1@meta.data$seurat_clusters)
185
+ keys <- sapply(resolution, function(res) paste0(key, ".", res))
186
+ keys <- c(keys, key)
187
+ cached$data <- sobj1@meta.data[, keys, drop = FALSE]
188
+ save_to_cache(cached, "FindClusters", cache_dir)
189
+ rm(sobj1)
190
+ } else {
191
+ log_info("- Using cached FindClusters at resolution: {paste(resolution, collapse = ',')} ...")
142
192
  }
143
- for (res in resolution) {
144
- case$FindClusters$resolution <- res
145
- cached <- get_cached(case$FindClusters, paste0("FindClusters_", res), cache_dir)
146
- res_key <- paste0("seurat_clusters_", res)
147
- if (is.null(cached$data)) {
148
- log_info("- Running FindClusters at resolution: {res} ...")
149
- case$FindClusters$object <- sobj
150
- sobj1 <- do_call(FindClusters, case$FindClusters)
151
- levels(sobj1$seurat_clusters) <- paste0("s", as.numeric(levels(sobj1$seurat_clusters)) + 1)
152
- sobj1[[res_key]] <- sobj1$seurat_clusters
153
- cached$data <- sobj1@meta.data[, res_key, drop = FALSE]
154
- save_to_cache(cached, paste0("FindClusters_", res), cache_dir)
155
- } else {
156
- log_info("- Using cached FindClusters at resolution: {res} ...")
157
- }
158
- ident_table <- table(cached$data[[res_key]])
159
- log_info(" Found {length(ident_table)} clusters")
160
- print(ident_table)
161
- cat("\n")
193
+
194
+ ident_table <- table(cached$data[[key]])
195
+ log_info(" Found {length(ident_table)} clusters")
196
+ print(ident_table)
197
+ cat("\n")
198
+
199
+ if (length(resolution) > 1) {
200
+ log_info("- Plotting clustree ...")
201
+ png(
202
+ file.path(joboutdir, paste0(key, ".clustree.png")),
203
+ res = case$clustree_devpars$res,
204
+ width = case$clustree_devpars$width,
205
+ height = case$clustree_devpars$height
206
+ )
207
+ p <- clustree(cached$data, prefix = paste0(key, "."))
208
+ print(p)
209
+ dev.off()
162
210
  }
211
+
163
212
  log_info("- Updating meta.data with subclusters...")
164
- srtobj <- AddMetaData(srtobj, metadata = cached$data, col.name = key)
213
+ srtobj <- AddMetaData(srtobj, metadata = cached$data)
165
214
  srtobj[[paste0("sub_umap_", key)]] <- reduc
166
215
  }
167
216
 
@@ -1,5 +1,6 @@
1
1
  source("{{biopipen_dir}}/utils/misc.R")
2
2
  library(rlang)
3
+ library(rtracklayer)
3
4
  library(MatrixEQTL)
4
5
 
5
6
  snpfile = {{in.geno | r}}
@@ -11,6 +12,7 @@ outfile = {{out.cisqtls | r}}
11
12
 
12
13
  model = {{envs.model | r}}
13
14
  pval = {{envs.pval | r}}
15
+ match_samples = {{envs.match_samples | r}}
14
16
  transp = {{envs.transp | r}}
15
17
  fdr = {{envs.fdr | r}}
16
18
  snppos = {{envs.snppos | r}}
@@ -36,7 +38,9 @@ if (!trans_enabled && !cis_enabled) {
36
38
  transp <- 1e-5
37
39
  }
38
40
 
39
- transpose_file <- function(file) {
41
+ transpose_file <- function(file, what) {
42
+ if (is.null(file)) return(NULL)
43
+ log_info("Transposing {what} file ...")
40
44
  out <- file.path(joboutdir, paste0(
41
45
  tools::file_path_sans_ext(basename(file)),
42
46
  ".transposed.",
@@ -47,10 +51,11 @@ transpose_file <- function(file) {
47
51
  out
48
52
  }
49
53
 
50
- if (transpose_geno) snpfile = transpose_file(snpfile)
51
- if (transpose_expr) expfile = transpose_file(expfile)
52
- if (transpose_cov) covfile = transpose_file(covfile)
54
+ if (transpose_geno) snpfile = transpose_file(snpfile, "geno")
55
+ if (transpose_expr) expfile = transpose_file(expfile, "expr")
56
+ if (transpose_cov) covfile = transpose_file(covfile, "cov")
53
57
 
58
+ log_info("Loading SNP data ...")
54
59
  snps = SlicedData$new();
55
60
  snps$fileDelimiter = "\t"; # the TAB character
56
61
  snps$fileOmitCharacters = "NA"; # denote missing values;
@@ -59,6 +64,7 @@ snps$fileSkipColumns = 1; # one column of row labels
59
64
  snps$fileSliceSize = 10000; # read file in pieces of 2,000 rows
60
65
  snps$LoadFile( snpfile );
61
66
 
67
+ log_info("Loading gene expression data ...")
62
68
  gene = SlicedData$new();
63
69
  gene$fileDelimiter = "\t"; # the TAB character
64
70
  gene$fileOmitCharacters = "NA"; # denote missing values;
@@ -69,16 +75,39 @@ gene$LoadFile( expfile );
69
75
 
70
76
  cvrt = SlicedData$new();
71
77
  if (!is.null(covfile) && file.exists(covfile)) {
72
- covmatrix = t(read.table.inopts(covfile, list(cnames=TRUE, rnames=TRUE)))
78
+ log_info("Loading covariate data ...")
79
+ covmatrix = read.table(covfile, header=TRUE, stringsAsFactors=FALSE, row.names=1, sep="\t", quote="", check.names=FALSE)
73
80
  cvrt$CreateFromMatrix( as.matrix(covmatrix) )
74
81
  }
75
82
 
83
+ log_info("Matching samples ...")
84
+ if (match_samples) {
85
+ # let matrixEQTL raise an error if samples do not match
86
+ } else {
87
+ n_sample_snps = snps$nCols()
88
+ n_sample_gene = gene$nCols()
89
+ common_samples = intersect(snps$columnNames, gene$columnNames)
90
+ if (!is.null(covfile)) {
91
+ common_samples = intersect(common_samples, cvrt$columnNames)
92
+ n_sample_cov = cvrt$nCols()
93
+ cvrt = cvrt$ColumnSubsample(match(common_samples, cvrt$columnNames))
94
+ }
95
+ snps = snps$ColumnSubsample(match(common_samples, snps$columnNames))
96
+ gene = gene$ColumnSubsample(match(common_samples, gene$columnNames))
97
+ log_info("- Samples used in SNP data: {n_sample_snps} -> {snps$nCols()}")
98
+ log_info("- Samples used in gene expression data: {n_sample_gene} -> {gene$nCols()}")
99
+ if (!is.null(covfile)) {
100
+ log_info("- Samples used in covariate data: {n_sample_cov} -> {cvrt$nCols()}")
101
+ }
102
+ }
103
+
104
+ log_info("Composing engine parameters ...")
76
105
  engine_params = list()
77
106
  engine_params$snps = snps
78
107
  engine_params$gene = gene
79
108
  engine_params$cvrt = cvrt
80
- engine_params$output_file_name = ifelse(trans_enabled, alleqtl, NULL)
81
- engine_params$pvOutputThreshold = ifelse(trans_enabled, transp, 0)
109
+ engine_params$output_file_name = if(trans_enabled) alleqtl else NULL
110
+ engine_params$pvOutputThreshold = if(trans_enabled) transp else 0
82
111
  engine_params$useModel = model
83
112
  engine_params$errorCovariance = numeric()
84
113
  engine_params$verbose = TRUE
@@ -89,66 +118,78 @@ noq = function(s) {
89
118
  }
90
119
 
91
120
  if (cis_enabled) {
121
+ log_info("Loading SNP positions ...")
92
122
  if (endsWith(snppos, ".bed")) {
93
- snppos_data = read.table.inopts(snppos,
94
- list(cnames=FALSE, rnames=FALSE))
95
- snppos_data = snppos_data[, c(4, 1, 2)]
96
- colnames(snppos_data) = c("snp", "chr", "pos")
123
+ snppos_data = read.table(snppos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
124
+ snppos_data = data.frame(
125
+ snp = snppos_data$V4,
126
+ chr = snppos_data$V1,
127
+ pos = snppos_data$V3
128
+ )
97
129
  } else if (endsWith(snppos, ".gff") || endsWith(snppos, ".gtf")) {
98
- snppos_data = read.table.inopts(snppos,
99
- list(cnames=FALSE, rnames=FALSE));
100
- snppos_data = snppos_data[, c(9, 1, 4)]
101
- colnames(snppos_data) = c("snp", "chr", "pos")
102
- snppos_data$snp = unlist(lapply(snppos_data$snp, function(x) {
103
- for (s in unlist(strsplit(x, '; ', fixed=T))) {
104
- if (startsWith(s, "snp_id "))
105
- return(noq(substring(s, 8)))
106
- else if (startsWith(s, "rs_id "))
107
- return(noq(substring(s, 7)))
108
- else if (startsWith(s, "rs "))
109
- return(noq(substring(s, 4)))
110
- }
111
- }))
130
+ snppos_data = import(snppos)
131
+ elem_meta = elementMetadata(snppos_data)
132
+ snppos_data = data.frame(
133
+ snp = elem_meta$snp_id %||% elem_meta$rs_id %||% elem_meta$rs,
134
+ chr = as.character(seqnames(snppos_data)),
135
+ pos = start(snppos_data)
136
+ )
112
137
  } else if (endsWith(snppos, ".vcf") || endsWith(snppos, ".vcf.gz")) {
113
- snppos_data = read.table.inopts(snppos,
114
- list(cnames=FALSE, rnames=FALSE))
138
+ snppos_data = read.table(
139
+ snppos,
140
+ header=FALSE,
141
+ row.names=NULL,
142
+ stringsAsFactors=FALSE,
143
+ check.names=FALSE
144
+ )
115
145
  snppos_data = snppos_data[, c(3, 1, 2)]
116
146
  colnames(snppos_data) = c("snp", "chr", "pos")
117
147
  } else {
118
- snppos_data = read.table.inopts(snppos, list(cnames=TRUE))
148
+ snppos_data = read.table(
149
+ snppos,
150
+ header=FALSE,
151
+ row.names=NULL,
152
+ stringsAsFactors=FALSE,
153
+ check.names=FALSE
154
+ )
119
155
  colnames(snppos_data) = c("snp", "chr", "pos")
120
156
  }
121
157
 
158
+ log_info("Loading gene positions ...")
122
159
  if (endsWith(genepos, ".bed")) {
123
- genepos_data = read.table.inopts(genepos,
124
- list(cnames=FALSE, rnames=FALSE))
125
- genepos_data = genepos_data[, c(4, 1:3)]
126
- colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
160
+ genepos_data = read.table(genepos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
161
+ genepos_data = data.frame(
162
+ geneid = genepos_data$V4,
163
+ chr = genepos_data$V1,
164
+ s1 = genepos_data$V2,
165
+ s2 = genepos_data$V3
166
+ )
127
167
  } else if (endsWith(genepos, ".gff") || endsWith(genepos, ".gtf")) {
128
- genepos_data = read.table.inopts(genepos,
129
- list(cnames=FALSE, rnames=FALSE))
130
- genepos_data = genepos_data[, c(9, 1, 4, 5)]
131
- colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
132
- genepos_data$geneid = noquote(unlist(lapply(genepos_data$geneid, function(x) {
133
- for (s in unlist(strsplit(x, '; ', fixed=T))) {
134
- if (startsWith(s, "gene_id "))
135
- return(noq(substring(s, 9)))
136
- }
137
- })))
168
+ genepos_data = import(genepos)
169
+ elem_meta = elementMetadata(genepos_data)
170
+ genepos_data = data.frame(
171
+ geneid = elem_meta$gene_id %||% elem_meta$gene_name,
172
+ chr = as.character(seqnames(genepos_data)),
173
+ s1 = start(genepos_data),
174
+ s2 = end(genepos_data)
175
+ )
138
176
  } else {
139
177
  genepos_data = read.table(genepos, header = TRUE, stringsAsFactors = FALSE);
140
178
  colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
141
179
  }
142
180
 
181
+ log_info("Running MatrixEQTL with cis-eQTLs enabled ...")
143
182
  engine_params$output_file_name.cis = outfile
144
183
  engine_params$pvOutputThreshold.cis = pval
145
184
  engine_params$cisDist = dist
146
185
  engine_params$snpspos = snppos_data
147
186
  engine_params$genepos = genepos_data
148
187
  do_call(Matrix_eQTL_main, engine_params)
188
+ if (!file.exists(alleqtl)) file.create(alleqtl)
149
189
  } else {
190
+ log_info("Running MatrixEQTL without cis-eQTLs ...")
150
191
  do_call(Matrix_eQTL_engine, engine_params)
151
- file.create(outfile)
192
+ if (!file.exists(outfile)) file.create(outfile)
152
193
  }
153
194
 
154
195
  if (pval == 0) {