biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (85):
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +204 -0
  11. biopipen/ns/regulatory.py +214 -0
  12. biopipen/ns/scrna.py +31 -5
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +167 -3
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/delim/SampleInfo.R +10 -5
  36. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  37. biopipen/scripts/gene/GenePromoters.R +61 -0
  38. biopipen/scripts/misc/Shell.sh +15 -0
  39. biopipen/scripts/plot/Manhattan.R +146 -0
  40. biopipen/scripts/plot/QQPlot.R +146 -0
  41. biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
  42. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
  43. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
  44. biopipen/scripts/regulatory/MotifScan.py +159 -0
  45. biopipen/scripts/regulatory/atSNP.R +33 -0
  46. biopipen/scripts/regulatory/motifBreakR.R +1594 -0
  47. biopipen/scripts/scrna/MarkersFinder.R +69 -67
  48. biopipen/scripts/scrna/SeuratClustering.R +71 -29
  49. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  50. biopipen/scripts/scrna/SeuratPreparing.R +252 -122
  51. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  52. biopipen/scripts/snp/MatrixEQTL.R +85 -44
  53. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  54. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  55. biopipen/scripts/snp/PlinkFilter.py +100 -0
  56. biopipen/scripts/snp/PlinkFreq.R +298 -0
  57. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  58. biopipen/scripts/snp/PlinkHWE.R +80 -0
  59. biopipen/scripts/snp/PlinkHet.R +92 -0
  60. biopipen/scripts/snp/PlinkIBD.R +200 -0
  61. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  62. biopipen/scripts/stats/Mediation.R +94 -0
  63. biopipen/scripts/stats/MetaPvalue.R +2 -1
  64. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  65. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  66. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  67. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  68. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  69. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  70. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  71. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  72. biopipen/utils/gene.R +83 -37
  73. biopipen/utils/gene.py +108 -60
  74. biopipen/utils/misc.R +56 -0
  75. biopipen/utils/misc.py +5 -2
  76. biopipen/utils/reference.py +54 -10
  77. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
  78. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
  79. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
  80. biopipen/ns/bcftools.py +0 -111
  81. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  82. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  83. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  84. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  85. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
@@ -65,6 +65,19 @@ if (ncores > 1) {
65
65
  log_info("- Reading Seurat object ...")
66
66
  srtobj <- readRDS(srtfile)
67
67
  defassay <- DefaultAssay(srtobj)
68
# For an SCT default assay, run PrepSCTFindMarkers() once unless the object
# already records that command, then log the step in srtobj@commands so this
# guard is skipped the next time the object is loaded.
if (defassay == "SCT" && !"PrepSCTFindMarkers" %in% names(srtobj@commands)) {
  log_warn(" SCTransform used but PrepSCTFindMarkers not applied, running ...")

  srtobj <- PrepSCTFindMarkers(srtobj)

  # Compose a SeuratCommand to record the call. The FindClusters record is
  # reused as a template when present; objects that were never clustered with
  # FindClusters() have no such record, so start from an empty SeuratCommand
  # instead of failing on a NULL template.
  scommand <- srtobj@commands$FindClusters
  if (is.null(scommand)) {
    scommand <- methods::new("SeuratCommand")
  }
  scommand@name <- "PrepSCTFindMarkers"
  scommand@time.stamp <- Sys.time()
  scommand@assay.used <- "SCT"
  scommand@call.string <- "PrepSCTFindMarkers(object = srtobj)"
  scommand@params <- list()
  srtobj@commands$PrepSCTFindMarkers <- scommand
}
68
81
 
69
82
  if (!is.null(mutaters) && length(mutaters) > 0) {
70
83
  log_info("- Mutating meta data ...")
@@ -411,45 +424,11 @@ do_case_findall <- function(casename) {
411
424
  log_info(" Using cached markers ...")
412
425
  markers <- cached$data
413
426
  } else {
414
- markers <- tryCatch({
415
- do_call(FindAllMarkers, args)
416
- # gene, p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster
417
- }, error = function(e) {
418
- log_warn(e$message)
419
-
420
- data.frame(
421
- gene = character(),
422
- p_val = numeric(),
423
- avg_log2FC = numeric(),
424
- pct.1 = numeric(),
425
- pct.2 = numeric(),
426
- p_val_adj=numeric(),
427
- cluster = character()
428
- )
429
- })
427
+ markers <- find_markers(args, find_all = TRUE)
430
428
  cached$data <- markers
431
429
  save_to_cache(cached, "FindAllMarkers", cache)
432
430
  }
433
431
 
434
- if (nrow(markers) == 0 && defassay == "SCT") {
435
- log_warn(" No markers found from SCT assay, try recorrect_umi = FALSE")
436
- args$recorrect_umi <- FALSE
437
- markers <- tryCatch({
438
- do_call(FindAllMarkers, args)
439
- }, error = function(e) {
440
- log_warn(e$message)
441
- data.frame(
442
- gene = character(),
443
- p_val = numeric(),
444
- avg_log2FC = numeric(),
445
- pct.1 = numeric(),
446
- pct.2 = numeric(),
447
- p_val_adj=numeric(),
448
- cluster = character()
449
- )
450
- })
451
- }
452
-
453
432
  if (is.null(case$dotplot$assay)) {
454
433
  case$dotplot$assay <- case$assay
455
434
  }
@@ -483,6 +462,60 @@ do_case_findall <- function(casename) {
483
462
  }
484
463
  }
485
464
 
465
# Run Seurat marker detection, returning an empty table instead of failing.
#
# @param findmarkers_args Named list of arguments passed through do_call() to
#   FindAllMarkers() or FindMarkers().
# @param find_all If TRUE, call FindAllMarkers(); otherwise call FindMarkers()
#   and move its row names into a `gene` column.
# @return A data frame of markers. If the call errors, a zero-row data frame
#   with the columns gene, p_val, avg_log2FC, pct.1, pct.2, p_val_adj (plus
#   cluster when find_all is TRUE) is returned.
#
# Reads the script-level variable `defassay`: when it is "SCT" and no markers
# were obtained, the call is repeated once with recorrect_umi = FALSE.
find_markers <- function(findmarkers_args, find_all = FALSE) {
  if (find_all) {
    fun <- FindAllMarkers
    empty <- data.frame(
      gene = character(),
      p_val = numeric(),
      avg_log2FC = numeric(),
      pct.1 = numeric(),
      pct.2 = numeric(),
      p_val_adj = numeric(),
      cluster = character()
    )
  } else {
    fun <- FindMarkers
    empty <- data.frame(
      gene = character(),
      p_val = numeric(),
      avg_log2FC = numeric(),
      pct.1 = numeric(),
      pct.2 = numeric(),
      p_val_adj = numeric()
    )
  }

  # FindMarkers() keeps genes in row names; FindAllMarkers() already has a
  # gene column, so only the former needs converting.
  call_findmarkers <- function(fn, args) {
    result <- do_call(fn, args)
    if (find_all) {
      result
    } else {
      result %>% rownames_to_column("gene")
    }
  }

  markers <- tryCatch({
    call_findmarkers(fun, findmarkers_args)
  }, error = function(e) {
    msg <- conditionMessage(e)
    # Stay quiet only for the SCT error that mentions PrepSCTFindMarkers: it
    # is retried below with recorrect_umi = FALSE. Every other failure,
    # including any failure on a non-SCT assay, must be reported rather than
    # silently turned into an empty result.
    retried_below <- defassay == "SCT" && grepl("PrepSCTFindMarkers", msg)
    if (!retried_below) {
      log_warn(paste0(" ! ", msg))
    }
    empty
  })

  if (nrow(markers) == 0 && defassay == "SCT") {
    log_warn(" ! No markers found from SCT assay, trying recorrect_umi = FALSE")
    findmarkers_args$recorrect_umi <- FALSE
    markers <- tryCatch({
      call_findmarkers(fun, findmarkers_args)
    }, error = function(e) {
      log_warn(paste0(" ! ", conditionMessage(e)))
      empty
    })
  }

  markers
}
518
+
486
519
  sections <- c()
487
520
  do_case <- function(casename) {
488
521
  if (isTRUE(cases[[casename]]$findall)) {
@@ -538,38 +571,7 @@ do_case <- function(casename) {
538
571
  # args$min.cells.feature <- args$min.cells.feature %||% 1
539
572
  # args$min.pct <- args$min.pct %||% 0
540
573
 
541
- markers <- tryCatch({
542
- do_call(FindMarkers, args) %>% rownames_to_column("gene")
543
- }, error = function(e) {
544
- log_warn(paste0(" ", e$message))
545
- data.frame(
546
- gene = character(),
547
- p_val = numeric(),
548
- avg_log2FC = numeric(),
549
- pct.1 = numeric(),
550
- pct.2 = numeric(),
551
- p_val_adj = numeric()
552
- )
553
- })
554
-
555
- if (nrow(markers) == 0 && defassay == "SCT") {
556
- log_warn(" No markers found from SCT assay, trying recorrect_umi = FALSE")
557
- args$recorrect_umi <- FALSE
558
- markers <- tryCatch({
559
- do_call(FindMarkers, args) %>% rownames_to_column("gene")
560
- }, error = function(e) {
561
- log_warn(paste0(" ", e$message))
562
- data.frame(
563
- gene = character(),
564
- p_val = numeric(),
565
- avg_log2FC = numeric(),
566
- pct.1 = numeric(),
567
- pct.2 = numeric(),
568
- p_val_adj=numeric()
569
- )
570
- })
571
- }
572
-
574
+ markers <- find_markers(args)
573
575
  siggenes <- do_enrich(info, markers, case$sigmarkers, case$volcano_genes)
574
576
 
575
577
  if (length(siggenes) > 0) {
@@ -3,9 +3,11 @@ source("{{biopipen_dir}}/utils/caching.R")
3
3
 
4
4
  library(Seurat)
5
5
  library(future)
6
+ library(rlang)
6
7
  library(tidyr)
7
8
  library(dplyr)
8
9
  library(digest)
10
+ library(clustree)
9
11
 
10
12
  set.seed(8525)
11
13
 
@@ -129,45 +131,85 @@ if (is.null(cached$data)) {
129
131
  }
130
132
 
131
133
  envs$FindClusters$random.seed <- envs$FindClusters$random.seed %||% 8525
132
- resolution <- envs$FindClusters$resolution %||% 0.8
133
- if (is.character(resolution)) {
134
- if (grepl(",", resolution)) {
135
- resolution <- as.numeric(trimws(unlist(strsplit(resolution, ","))))
136
- } else {
137
- resolution <- as.numeric(resolution)
134
# Expand a FindClusters resolution specification into a numeric vector.
#
# @param resolution A numeric vector, a character vector, or a list mixing
#   both. Character entries may contain comma-separated items; each item is
#   either a single number or a range written as "start:end" (step 0.1) or
#   "start:end:step".
# @return A numeric vector of resolutions without duplicates. When a value
#   occurs more than once its last occurrence is kept, so the last requested
#   resolution stays last.
expand_resolution <- function(resolution) {
  to_number <- function(text, item) {
    value <- suppressWarnings(as.numeric(text))
    if (anyNA(value)) {
      stop(
        "Invalid resolution format: ", item, ". Expected numeric values.",
        call. = FALSE
      )
    }
    value
  }

  expand_item <- function(item) {
    if (!grepl(":", item)) {
      return(to_number(item, item))
    }
    # Kept in its own variable so the vector of comma-separated items being
    # iterated over is not overwritten.
    bounds <- trimws(unlist(strsplit(item, ":")))
    if (length(bounds) == 2) {
      bounds <- c(bounds, "0.1")
    }
    if (length(bounds) != 3) {
      stop(
        "Invalid resolution format: ", item,
        ". Expected 2 or 3 parts separated by ':' for a range.",
        call. = FALSE
      )
    }
    bounds <- to_number(bounds, item)
    # seq() accumulates floating-point error (e.g. 0.30000000000000004);
    # rounding lets an explicit 0.3 de-duplicate against a range-generated 0.3.
    round(seq(bounds[1], bounds[2], by = bounds[3]), 10)
  }

  expand_entry <- function(res) {
    if (is.numeric(res)) {
      return(res)
    }
    items <- trimws(unlist(strsplit(as.character(res), ",")))
    unlist(lapply(items, expand_item))
  }

  expanded_res <- unlist(lapply(resolution, expand_entry))
  # keep the last resolution at last
  rev(unique(rev(expanded_res)))
}
160
+ resolution <- envs$FindClusters$resolution <- expand_resolution(envs$FindClusters$resolution %||% 0.8)
161
+ log_info("Running FindClusters at resolution: {paste(resolution, collapse=',')} ...")
162
+
163
+ envs$FindClusters$object <- sobj
164
+ sobj <- do_call(FindClusters, envs$FindClusters)
140
165
 
166
+ # recode clusters from 0, 1, 2, ... to c1, c2, c3, ...
167
# Relabel zero-based cluster ids (0, 1, 2, ...) as c1, c2, c3, ...
#
# @param clusters A factor whose values and levels are integer-like ids.
# @return A factor with the relabelled values, its levels in the same order
#   as the input levels.
recode_clusters <- function(clusters) {
  to_label <- function(ids) {
    paste0("c", as.integer(as.character(ids)) + 1)
  }
  relabelled_levels <- to_label(levels(clusters))
  factor(to_label(clusters), levels = relabelled_levels)
}
172
+
173
# Prefix of the per-resolution metadata columns written by FindClusters(),
# i.e. "<graph.name>_res.". The "_res." infix is appended for a user-supplied
# graph.name as well as for the default "<assay>_snn"; without it the column
# lookup below would find nothing whenever graph.name is set.
graph_name <- paste0(
  envs$FindClusters$graph.name %||% paste0(DefaultAssay(sobj), "_snn"),
  "_res."
)
141
174
  for (res in resolution) {
142
- envs$FindClusters$resolution <- res
143
- cached <- get_cached(envs$FindClusters, paste0("FindClusters_", res), cache_dir)
144
- res_key <- paste0("seurat_clusters_", res)
145
- if (is.null(cached$data)) {
146
- log_info("Running FindClusters at resolution: {res} ...")
147
- envs$FindClusters$object <- sobj
148
- sobj <- do_call(FindClusters, envs$FindClusters)
149
- levels(sobj$seurat_clusters) <- paste0("c", as.numeric(levels(sobj$seurat_clusters)) + 1)
150
- sobj[[res_key]] <- sobj$seurat_clusters
151
- Idents(sobj) <- "seurat_clusters"
152
- cached$data <- list(clusters = sobj$seurat_clusters, commands = sobj@commands)
153
- save_to_cache(cached, paste0("FindClusters_", res), cache_dir)
154
- } else {
155
- log_info("Loading cached FindClusters at resolution: {res} ...")
156
- sobj@commands <- cached$data$commands
157
- sobj[[res_key]] <- cached$data$clusters
158
- sobj$seurat_clusters <- cached$data$clusters
159
- Idents(sobj) <- "seurat_clusters"
160
- }
161
- ident_table <- table(Idents(sobj))
162
- log_info("- Found {length(ident_table)} clusters")
163
- print(ident_table)
164
- cat("\n")
175
+ cluster_name <- paste0(graph_name, res)
176
+ new_cluster_name <- paste0("seurat_clusters.", res)
177
+ sobj@meta.data[[new_cluster_name]] <- recode_clusters(sobj@meta.data[[cluster_name]])
178
+ }
179
+ sobj@meta.data$seurat_clusters <- recode_clusters(sobj@meta.data$seurat_clusters)
180
+ Idents(sobj) <- "seurat_clusters"
181
+
182
+ ident_table <- table(Idents(sobj))
183
+ log_info("- Found {length(ident_table)} clusters at resolution {resolution[length(resolution)]}")
184
+ print(ident_table)
185
+ cat("\n")
186
+
187
+ # plot the tree
188
+ if (length(resolution) > 1) {
189
+ log_info("Plotting clustree ...")
190
+ png(
191
+ file.path(joboutdir, "clustree.png"),
192
+ res = envs$clustree_devpars$res,
193
+ width = envs$clustree_devpars$width,
194
+ height = envs$clustree_devpars$height
195
+ )
196
+ p <- clustree(sobj, prefix = "seurat_clusters.")
197
+ print(p)
198
+ dev.off()
165
199
  }
166
200
 
167
201
  if (DefaultAssay(sobj) == "SCT") {
168
202
  # https://github.com/satijalab/seurat/issues/6968
169
203
  log_info("Running PrepSCTFindMarkers ...")
170
204
  sobj <- PrepSCTFindMarkers(sobj)
205
+ # compose a new SeuratCommand to record it to sobj@commands
206
+ scommand <- sobj@commands$FindClusters
207
+ scommand@name <- "PrepSCTFindMarkers"
208
+ scommand@time.stamp <- Sys.time()
209
+ scommand@assay.used <- "SCT"
210
+ scommand@call.string <- "PrepSCTFindMarkers(object = sobj)"
211
+ scommand@params <- list()
212
+ sobj@commands$PrepSCTFindMarkers <- scommand
171
213
  }
172
214
 
173
215
  log_info("Saving results ...")
@@ -63,6 +63,26 @@ if (endsWith(ref, ".rds") || endsWith(ref, ".RDS")) {
63
63
  reference = LoadH5Seurat(ref)
64
64
  }
65
65
 
66
+ # check if refdata exists in the reference
67
# Check that every entry of mapquery_args$refdata names something present in
# the reference, either among names(reference) or as a metadata column.
for (rname in names(mapquery_args$refdata)) {
  use_name <- mapquery_args$refdata[[rname]]
  # transferring an assay
  if (use_name %in% names(reference)) { next }
  # transferring a metadata column
  if (!use_name %in% colnames(reference@meta.data)) {
    msg <- paste0(
      "The reference does not have the column '",
      use_name,
      "' in either assays or metadata. "
    )
    # The hint has to be part of the single stop() call: anything placed after
    # stop() never runs.
    if (startsWith(use_name, "predicted.")) {
      # Drop the 10-character "predicted." prefix to suggest the bare name.
      msg <- paste0(msg, "Do you mean: ", substring(use_name, 11), "?")
    }
    stop(msg)
  }
}
85
+
66
86
  if (refnorm == "auto" && DefaultAssay(reference) == "SCT") {
67
87
  refnorm = "SCTransform"
68
88
  }