biopipen 0.33.0__py3-none-any.whl → 0.34.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (150) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/filters.py +10 -183
  3. biopipen/core/proc.py +5 -3
  4. biopipen/core/testing.py +8 -1
  5. biopipen/ns/bam.py +40 -4
  6. biopipen/ns/cnv.py +1 -1
  7. biopipen/ns/cnvkit.py +1 -1
  8. biopipen/ns/delim.py +1 -1
  9. biopipen/ns/gsea.py +63 -37
  10. biopipen/ns/misc.py +38 -0
  11. biopipen/ns/plot.py +8 -0
  12. biopipen/ns/scrna.py +307 -288
  13. biopipen/ns/scrna_metabolic_landscape.py +207 -366
  14. biopipen/ns/tcr.py +165 -97
  15. biopipen/reports/bam/CNVpytor.svelte +4 -9
  16. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  17. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  18. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  19. biopipen/reports/{delim/SampleInfo.svelte → common.svelte} +2 -3
  20. biopipen/reports/scrna/DimPlots.svelte +1 -1
  21. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +51 -22
  22. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +46 -42
  23. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +63 -6
  24. biopipen/reports/snp/PlinkCallRate.svelte +2 -2
  25. biopipen/reports/snp/PlinkFreq.svelte +1 -1
  26. biopipen/reports/snp/PlinkHWE.svelte +1 -1
  27. biopipen/reports/snp/PlinkHet.svelte +1 -1
  28. biopipen/reports/snp/PlinkIBD.svelte +1 -1
  29. biopipen/reports/tcr/CDR3AAPhyschem.svelte +1 -1
  30. biopipen/scripts/bam/CNAClinic.R +41 -6
  31. biopipen/scripts/bam/CNVpytor.py +2 -1
  32. biopipen/scripts/bam/ControlFREEC.py +2 -3
  33. biopipen/scripts/bam/SamtoolsView.py +33 -0
  34. biopipen/scripts/cnv/AneuploidyScore.R +25 -13
  35. biopipen/scripts/cnv/AneuploidyScoreSummary.R +218 -163
  36. biopipen/scripts/cnv/TMADScore.R +4 -4
  37. biopipen/scripts/cnv/TMADScoreSummary.R +51 -84
  38. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +3 -3
  39. biopipen/scripts/cnvkit/CNVkitHeatmap.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitReference.py +3 -3
  41. biopipen/scripts/delim/RowsBinder.R +1 -1
  42. biopipen/scripts/delim/SampleInfo.R +14 -2
  43. biopipen/scripts/gene/GeneNameConversion.R +14 -12
  44. biopipen/scripts/gsea/Enrichr.R +2 -2
  45. biopipen/scripts/gsea/FGSEA.R +184 -50
  46. biopipen/scripts/gsea/PreRank.R +3 -3
  47. biopipen/scripts/misc/Plot.R +80 -0
  48. biopipen/scripts/plot/VennDiagram.R +2 -2
  49. biopipen/scripts/protein/ProdigySummary.R +34 -27
  50. biopipen/scripts/regulatory/MotifAffinityTest.R +11 -9
  51. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +5 -5
  52. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +4 -4
  53. biopipen/scripts/regulatory/VariantMotifPlot.R +10 -8
  54. biopipen/scripts/regulatory/motifs-common.R +10 -9
  55. biopipen/scripts/rnaseq/Simulation-ESCO.R +14 -11
  56. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +7 -4
  57. biopipen/scripts/rnaseq/Simulation.R +0 -2
  58. biopipen/scripts/rnaseq/UnitConversion.R +6 -5
  59. biopipen/scripts/scrna/AnnData2Seurat.R +25 -73
  60. biopipen/scripts/scrna/CellCellCommunication.py +1 -1
  61. biopipen/scripts/scrna/CellCellCommunicationPlots.R +51 -168
  62. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +99 -150
  63. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +11 -9
  64. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +12 -9
  65. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +14 -11
  66. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +19 -16
  67. biopipen/scripts/scrna/CellTypeAnnotation.R +10 -2
  68. biopipen/scripts/scrna/CellsDistribution.R +1 -1
  69. biopipen/scripts/scrna/ExprImputation-alra.R +87 -11
  70. biopipen/scripts/scrna/ExprImputation-rmagic.R +247 -21
  71. biopipen/scripts/scrna/ExprImputation-scimpute.R +8 -5
  72. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  73. biopipen/scripts/scrna/MarkersFinder.R +348 -217
  74. biopipen/scripts/scrna/MetaMarkers.R +3 -3
  75. biopipen/scripts/scrna/ModuleScoreCalculator.R +14 -13
  76. biopipen/scripts/scrna/RadarPlots.R +1 -1
  77. biopipen/scripts/scrna/ScFGSEA.R +157 -75
  78. biopipen/scripts/scrna/ScSimulation.R +11 -10
  79. biopipen/scripts/scrna/ScVelo.py +605 -0
  80. biopipen/scripts/scrna/Seurat2AnnData.R +2 -3
  81. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +1 -1
  82. biopipen/scripts/scrna/SeuratClusterStats-features.R +39 -30
  83. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +56 -65
  84. biopipen/scripts/scrna/SeuratClusterStats-stats.R +4 -4
  85. biopipen/scripts/scrna/SeuratClusterStats.R +9 -6
  86. biopipen/scripts/scrna/SeuratClustering.R +31 -48
  87. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  88. biopipen/scripts/scrna/SeuratMap2Ref.R +66 -367
  89. biopipen/scripts/scrna/SeuratMetadataMutater.R +5 -7
  90. biopipen/scripts/scrna/SeuratPreparing.R +76 -24
  91. biopipen/scripts/scrna/SeuratSubClustering.R +46 -185
  92. biopipen/scripts/scrna/{SlingShot.R → Slingshot.R} +12 -16
  93. biopipen/scripts/scrna/Subset10X.R +2 -2
  94. biopipen/scripts/scrna/TopExpressingGenes.R +141 -184
  95. biopipen/scripts/scrna/celltypist-wrapper.py +6 -4
  96. biopipen/scripts/scrna/seurat_anndata_conversion.py +81 -0
  97. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +429 -123
  98. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +346 -245
  99. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +182 -173
  100. biopipen/scripts/snp/MatrixEQTL.R +39 -20
  101. biopipen/scripts/snp/PlinkCallRate.R +43 -34
  102. biopipen/scripts/snp/PlinkFreq.R +34 -41
  103. biopipen/scripts/snp/PlinkHWE.R +23 -18
  104. biopipen/scripts/snp/PlinkHet.R +26 -22
  105. biopipen/scripts/snp/PlinkIBD.R +30 -34
  106. biopipen/scripts/stats/ChowTest.R +9 -8
  107. biopipen/scripts/stats/DiffCoexpr.R +13 -11
  108. biopipen/scripts/stats/LiquidAssoc.R +7 -8
  109. biopipen/scripts/stats/Mediation.R +8 -8
  110. biopipen/scripts/stats/MetaPvalue.R +11 -13
  111. biopipen/scripts/stats/MetaPvalue1.R +6 -5
  112. biopipen/scripts/tcr/CDR3AAPhyschem.R +105 -164
  113. biopipen/scripts/tcr/ClonalStats.R +5 -4
  114. biopipen/scripts/tcr/CloneResidency.R +3 -3
  115. biopipen/scripts/tcr/CloneSizeQQPlot.R +2 -2
  116. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  117. biopipen/scripts/tcr/ImmunarchFilter.R +3 -3
  118. biopipen/scripts/tcr/ImmunarchLoading.R +5 -5
  119. biopipen/scripts/tcr/ScRepCombiningExpression.R +39 -0
  120. biopipen/scripts/tcr/ScRepLoading.R +114 -92
  121. biopipen/scripts/tcr/TCRClusterStats.R +2 -2
  122. biopipen/scripts/tcr/TCRClustering.R +86 -97
  123. biopipen/scripts/tcr/TESSA.R +65 -115
  124. biopipen/scripts/tcr/VJUsage.R +5 -5
  125. biopipen/scripts/vcf/TruvariBenchSummary.R +15 -11
  126. biopipen/utils/common_docstrs.py +66 -63
  127. biopipen/utils/reporter.py +177 -0
  128. {biopipen-0.33.0.dist-info → biopipen-0.34.0.dist-info}/METADATA +2 -1
  129. {biopipen-0.33.0.dist-info → biopipen-0.34.0.dist-info}/RECORD +131 -144
  130. {biopipen-0.33.0.dist-info → biopipen-0.34.0.dist-info}/WHEEL +1 -1
  131. biopipen/reports/scrna/CellCellCommunicationPlots.svelte +0 -14
  132. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -16
  133. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -37
  134. biopipen/reports/scrna/SeuratPreparing.svelte +0 -15
  135. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -28
  136. biopipen/reports/utils/gsea.liq +0 -110
  137. biopipen/scripts/scrna/CellTypeAnnotation-common.R +0 -10
  138. biopipen/scripts/scrna/SeuratClustering-common.R +0 -213
  139. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -193
  140. biopipen/utils/caching.R +0 -44
  141. biopipen/utils/gene.R +0 -95
  142. biopipen/utils/gsea.R +0 -329
  143. biopipen/utils/io.R +0 -20
  144. biopipen/utils/misc.R +0 -602
  145. biopipen/utils/mutate_helpers.R +0 -581
  146. biopipen/utils/plot.R +0 -209
  147. biopipen/utils/repr.R +0 -146
  148. biopipen/utils/rnaseq.R +0 -48
  149. biopipen/utils/single_cell.R +0 -207
  150. {biopipen-0.33.0.dist-info → biopipen-0.34.0.dist-info}/entry_points.txt +0 -0
@@ -1,13 +1,10 @@
1
- {{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
2
- {{ biopipen_dir | joinpaths: "utils", "gsea.R" | source_r }}
3
- {{ biopipen_dir | joinpaths: "utils", "plot.R" | source_r }}
4
-
5
1
  library(gtools)
6
- library(parallel)
7
- library(ggprism)
2
+ library(rlang)
8
3
  library(Matrix)
9
4
  library(sparseMatrixStats)
10
5
  library(Seurat)
6
+ library(tidyseurat)
7
+ library(biopipen.utils)
11
8
 
12
9
  sobjfile <- {{ in.sobjfile | r }}
13
10
  outdir <- {{ out.outdir | r }}
@@ -16,53 +13,81 @@ gmtfile <- {{ envs.gmtfile | r }}
16
13
  select_pcs <- {{ envs.select_pcs | r }}
17
14
  ncores <- {{ envs.ncores | r }}
18
15
  pathway_pval_cutoff <- {{ envs.pathway_pval_cutoff | r }}
19
- bubble_devpars <- {{ envs.bubble_devpars | r }}
20
- grouping <- {{ envs.grouping | r }}
21
- grouping_prefix <- {{ envs.grouping_prefix | r }}
22
- subsetting_cols <- {{ envs.subsetting | r }}
23
- subsetting_prefix <- {{ envs.subsetting_prefix | r }}
24
-
25
- if (!is.null(grouping_prefix) && nchar(grouping_prefix) > 0) {
26
- grouping_prefix = paste0(grouping_prefix, "_")
27
- }
28
-
29
- if (!is.null(subsetting_prefix) && nchar(subsetting_prefix) > 0) {
30
- subsetting_prefix = paste0(subsetting_prefix, "_")
31
- }
16
+ subset_by <- {{ envs.subset_by | r }}
17
+ group_by <- {{ envs.group_by | r }}
18
+ fgsea_args <- {{ envs.fgsea_args | r }}
19
+ plots <- {{ envs.plots | r }}
20
+ cases <- {{ envs.cases | r }}
32
21
 
33
22
  set.seed(8525)
34
23
 
35
- ## gmt_pathways is copied from fgsea package.
36
- gmt_pathways <- function(gmt_file) {
37
- pathway_lines <- strsplit(readLines(gmt_file), "\t")
38
- pathways <- lapply(pathway_lines, tail, -2)
39
- names(pathways) <- sapply(pathway_lines, head, 1)
40
- pathways
41
- }
24
+ log <- get_logger()
25
+ reporter <- get_reporter()
26
+
27
+ log$info("Loading Seurat object ...")
28
+ sobj <- read_obj(sobjfile)
42
29
 
43
- gmtfile <- localizeGmtfile(gmtfile)
44
- pathways <- gmt_pathways(gmtfile)
30
+ defaults <- list(
31
+ subset_by = subset_by,
32
+ group_by = group_by,
33
+ fgsea_args = fgsea_args,
34
+ plots = plots,
35
+ select_pcs = select_pcs,
36
+ pathway_pval_cutoff = pathway_pval_cutoff
37
+ )
38
+ log$info("Expanding cases ...")
39
+ default_case <- subset_by %||% "DEFAULT"
40
+ cases <- expand_cases(
41
+ cases,
42
+ defaults,
43
+ function(name, case) {
44
+ if (is.null(case$group_by)) {
45
+ stop("'group_by' is required in case: ", name)
46
+ }
47
+ stats::setNames(list(case), name)
48
+ },
49
+ default_case = default_case)
50
+
51
+ log$info("Loading metabolic pathways ...")
52
+ pathways <- ParseGMT(gmtfile)
53
+ pathway_names <- names(pathways)
45
54
  metabolics <- unique(as.vector(unname(unlist(pathways))))
46
- sobj <- readRDS(sobjfile)
47
55
 
48
56
 
49
- do_one_subset <- function(s, subset_col, subset_prefix) {
50
- log_info(paste0(" Handling subset value: ", s, " ..."))
51
- if (is.null(s)) {
52
- subset_dir = file.path(outdir, "ALL")
53
- subset_obj = sobj
57
+ do_subset <- function(object, caseinfo, subset_by, subset_val, group_by, plots, select_pcs, pathway_pval_cutoff) {
58
+ if (!is.null(subset_by)) {
59
+ log$info("- Handling subset: {subset_by} = {subset_val} ...")
60
+ object <- tryCatch(
61
+ filter(object, !!sym(subset_by) == subset_val & !is.na(!!sym(group_by))),
62
+ error = function(e) NULL
63
+ )
64
+ }
65
+ if (!is.null(subset_by)) {
66
+ h1 <- paste0(subset_by, ": ", subset_val)
67
+ h2 <- group_by
68
+ odir <- file.path(caseinfo$prefix, slugify(paste0(subset_by, "_", subset_val)))
69
+ } else if (length(cases) > 1) {
70
+ h1 <- "No Subsetting"
71
+ h2 <- group_by
72
+ odir <- file.path(caseinfo$prefix, "No_Subsetting")
54
73
  } else {
55
- subset_dir = file.path(outdir, slugify(paste0(subset_prefix, s)))
56
- subset_code = paste0("subset(sobj, subset = ", subset_col, " == '", s, "')")
57
- subset_obj = eval(parse(text = subset_code))
74
+ h1 <- group_by
75
+ h2 <- "#"
76
+ odir <- caseinfo$prefix
77
+ }
78
+ if (is.null(object) || ncol(object) < 5) {
79
+ msg <- paste0(" ! skipped. Subset has less than 5 cells: ", subset_by, " = ", subset_val)
80
+ log$warn(msg)
81
+ reporter$add(list(kind = "error", content = msg), h1 = h1, h2 = h2)
82
+ return(NULL)
58
83
  }
59
- dir.create(subset_dir, showWarnings = FALSE)
60
84
 
61
- features = intersect(rownames(subset_obj), metabolics)
62
- all_groups = as.character(subset_obj@meta.data[[grouping]])
63
- groups <- unique(all_groups)
85
+ dir.create(odir, showWarnings = FALSE)
64
86
 
65
- enrich_data_df <- data.frame(x = NULL, y = NULL, NES = NULL, PVAL = NULL)
87
+ features <- intersect(rownames(object), metabolics)
88
+ groups <- unique(as.character(object@meta.data[[group_by]]))
89
+
90
+ enrich_data_df <- NULL
66
91
  pc_plotdata <- data.frame(
67
92
  x = numeric(),
68
93
  y = numeric(),
@@ -71,11 +96,18 @@ do_one_subset <- function(s, subset_col, subset_prefix) {
71
96
  )
72
97
 
73
98
  for (group in groups) {
74
- group_code = paste0("subset(subset_obj, subset = ", grouping, " == '", group, "')")
75
- each_metabolic_obj <- eval(parse(text = group_code))
76
- each_metabolic_exprs <- GetAssayData(each_metabolic_obj)[features, , drop=F]
77
- each_metabolic_exprs <- each_metabolic_exprs[rowSums(each_metabolic_exprs) > 0, , drop=F]
78
- if (ncol(each_metabolic_exprs) == 1) { next }
99
+ log$info(" {group_by}: {group} ...")
100
+ each_metabolic_obj <- subset(object, subset = !!sym(group_by) == group)
101
+ if (ncol(each_metabolic_obj) < 5) {
102
+ log$warn(" ! skipped. Group has less than 5 cells: {group}")
103
+ next()
104
+ }
105
+ each_metabolic_exprs <- GetAssayData(each_metabolic_obj)[features, , drop = FALSE]
106
+ each_metabolic_exprs <- each_metabolic_exprs[rowSums(each_metabolic_exprs) > 0, , drop=FALSE]
107
+ if (ncol(each_metabolic_obj) < 5) {
108
+ log$warn(" ! skipped. Group has less than 5 active cells: {group}")
109
+ next()
110
+ }
79
111
  x <- each_metabolic_exprs
80
112
  ntop <- nrow(x)
81
113
  rv <- rowVars(x)
@@ -97,158 +129,135 @@ do_one_subset <- function(s, subset_col, subset_prefix) {
97
129
  pc_plotdata <- rbind(pc_plotdata, tmp_plotdata)
98
130
 
99
131
  ####
100
- pre_rank_matrix <- as.matrix(rowSums(abs(pca$rotation[, 1:selected_pcs, drop=FALSE])))
101
- pre_rank_matrix <- as.list(as.data.frame(t(pre_rank_matrix)))
102
-
103
- odir = file.path(subset_dir, paste0(grouping_prefix, slugify(group)))
104
- dir.create(odir, showWarnings = FALSE)
105
- runFGSEA(
106
- pre_rank_matrix,
107
- gmtfile = gmtfile,
108
- top = 100,
109
- outdir = odir,
110
- plot = FALSE,
111
- envs = list(scoreType = "std", nproc=1)
112
- )
113
- ############ Motify this
114
- result_file = file.path(odir, "fgsea.txt")
115
- gsea_result = read.table(result_file, header=T, row.names = NULL, sep="\t", check.names=F)
116
- # get the result
117
- enrich_data_df <- rbind(
118
- enrich_data_df,
119
- data.frame(x = group, y = gsea_result$pathway, NES = gsea_result$NES, PVAL = gsea_result$pval)
120
- )
132
+ pre_rank_matrix <- as.matrix(rowSums(abs(pca$rotation[, 1:selected_pcs, drop = FALSE])))
133
+ pre_rank_matrix <- unlist(as.list(as.data.frame(t(pre_rank_matrix))))
134
+
135
+ fgsea_args <- fgsea_args %||% list()
136
+ fgsea_args$ranks <- pre_rank_matrix
137
+ fgsea_args$genesets <- pathways
138
+ fgsea_args$nproc <- fgsea_args$nproc %||% ncores
139
+
140
+ tmp <- do_call(RunGSEA, fgsea_args)
141
+ tmp[[group_by]] <- group
142
+
143
+ if (is.null(enrich_data_df)) {
144
+ enrich_data_df <- tmp
145
+ } else {
146
+ enrich_data_df <- rbind(enrich_data_df, tmp)
147
+ }
121
148
  }
122
149
 
123
150
  # remove pvalue < 0.01 pathways
124
- min_pval <- by(enrich_data_df$PVAL, enrich_data_df$y, FUN = min)
151
+ min_pval <- by(enrich_data_df$pval, enrich_data_df$pathway, FUN = min)
125
152
  select_pathways <- names(min_pval)[(min_pval <= pathway_pval_cutoff)]
126
- select_enrich_data_df <- enrich_data_df[enrich_data_df$y %in% select_pathways, ]
153
+ select_enrich_data_df <- enrich_data_df[enrich_data_df$pathway %in% select_pathways, ]
127
154
  # converto pvalue to -log10
128
- pvals <- select_enrich_data_df$PVAL
155
+ pvals <- select_enrich_data_df$pval
129
156
  pvals[pvals <= 0] <- 1e-10
130
- select_enrich_data_df$PVAL <- -log10(pvals)
157
+ select_enrich_data_df$pval <- -log10(pvals)
131
158
 
132
159
  # sort
133
- pathway_pv_sum <- by(select_enrich_data_df$PVAL, select_enrich_data_df$y, FUN = sum)
160
+ pathway_pv_sum <- by(select_enrich_data_df$pval, select_enrich_data_df$pathway, FUN = sum)
134
161
  pathway_order <- names(pathway_pv_sum)[order(pathway_pv_sum, decreasing = T)]
135
162
  ########################### top 10
136
163
  pathway_order <- pathway_order[1:10]
137
- select_enrich_data_df <- select_enrich_data_df[select_enrich_data_df$y %in% pathway_order, ]
164
+ select_enrich_data_df <- select_enrich_data_df[select_enrich_data_df$pathway %in% pathway_order, ]
138
165
  ########################################
139
- select_enrich_data_df$x <- factor(select_enrich_data_df$x, levels = mixedsort(groups))
140
- select_enrich_data_df$y <- factor(select_enrich_data_df$y, levels = pathway_order)
141
-
142
- ## buble plot
143
- select_enrich_data_df$x = sapply(select_enrich_data_df$x, function(x) { paste0(grouping_prefix, x) })
144
- bubblefile = file.path(subset_dir, "pathway_heterogeneity.png")
145
- bub_devpars = list() # bubble_devpars
146
- if (is.null(bub_devpars$res)) {
147
- bub_devpars$res = 100
148
- }
149
- if (is.null(bub_devpars$width)) {
150
- bub_devpars$width = 300 +
151
- max(nchar(as.character(select_enrich_data_df$y))) * 8 +
152
- length(unique(select_enrich_data_df$x)) * 25
153
- }
154
- if (is.null(bub_devpars$height)) {
155
- bub_devpars$height = 400 +
156
- max(nchar(unique(select_enrich_data_df$x))) * 8 +
157
- length(unique(select_enrich_data_df$y)) * 25
158
- }
159
- bub_devpars$height = max(bub_devpars$height, 480)
160
- # For debug purposes
166
+ select_enrich_data_df[[group_by]] <- factor(select_enrich_data_df[[group_by]], levels = gtools::mixedsort(groups))
167
+ select_enrich_data_df$pathway <- factor(select_enrich_data_df$pathway, levels = pathway_order)
168
+
161
169
  write.table(
162
- select_enrich_data_df,
163
- file.path(subset_dir, "pathway_heterogeneity.txt"),
164
- sep="\t",
165
- quote=F,
166
- row.names=F
170
+ as.data.frame(select_enrich_data_df),
171
+ file = file.path(odir, "pathway_heterogeneity.txt"),
172
+ sep = "\t",
173
+ quote = FALSE,
174
+ row.names = FALSE
167
175
  )
168
- if (nrow(select_enrich_data_df) == 0) {
169
- p = ggplot(data.frame(text = "No significant pathways found")) +
170
- geom_text(aes(x = 0, y = 0, label = text), size = 10) +
171
- theme_void() +
172
- theme(
173
- plot.margin = unit(c(0, 0, 0, 0), "cm"),
174
- plot.background = element_rect(fill = "white", colour = NA)
175
- )
176
- png(bubblefile, width = 600, height = 100, res = 70)
176
+
177
+ for (plot in names(plots)) {
178
+ plotargs <- plots[[plot]]
179
+ plotargs$devpars <- plotargs$devpars %||% list()
180
+ plotargs$devpars$res <- plotargs$devpars$res %||% 100
181
+
182
+ if (plotargs$plot_type == "dot") {
183
+ plotargs$x <- plotargs$x %||% group_by
184
+ plotargs$y <- plotargs$y %||% "pathway"
185
+ plotargs$fill_by <- plotargs$fill_by %||% "NES"
186
+ plotargs$size_by <- plotargs$size_by %||% "pval"
187
+ plotargs$add_bg <- plotargs$add_bg %||% TRUE
188
+ plotargs$x_text_angle <- plotargs$x_text_angle %||% 90
189
+ plotfn <- plotthis::DotPlot
190
+ } else {
191
+ stop("Unknown plot type: ", plotargs$plot_type)
192
+ }
193
+
194
+ p <- do_call(plotfn, c(list(select_enrich_data_df), plotargs))
195
+ plotprefix <- file.path(odir, slugify(plot))
196
+ plotargs$devpars$width <- plotargs$devpars$width %||% (attr(p, "width") * plotargs$devpars$res) %||% 800
197
+ plotargs$devpars$height <- plotargs$devpars$height %||% (attr(p, "height") * plotargs$devpars$res) %||% 600
198
+ png(
199
+ filename = paste0(plotprefix, ".png"),
200
+ width = plotargs$devpars$width,
201
+ height = plotargs$devpars$height,
202
+ res = plotargs$devpars$res
203
+ )
177
204
  print(p)
178
205
  dev.off()
179
- } else {
180
- plotGG(
181
- select_enrich_data_df,
182
- "point",
183
- args = list(aes(x=x, y=y, size=PVAL, color=NES), shape=19),
184
- ggs = c(
185
- 'scale_size(range = c(2, 10))',
186
- 'scale_color_gradient(low = "white", high = "red")',
187
- 'labs(
188
- x = NULL, y = NULL, color="NES", size="-log10(pval)"
189
- )',
190
- 'theme_prism(axis_text_angle = 90)',
191
- 'theme(legend.title = element_text())'
206
+
207
+ reporter$add(
208
+ list(
209
+ name = plot,
210
+ contents = list(
211
+ list(kind = "descr", content = plotargs$descr %||% plot),
212
+ reporter$image(plotprefix, c(), FALSE, kind = "image")
213
+ )
192
214
  ),
193
- devpars = bub_devpars,
194
- outfile = bubblefile
215
+ h1 = h1,
216
+ h2 = h2,
217
+ ui = "tabs"
195
218
  )
196
219
  }
220
+ }
197
221
 
198
- ## plot variance
199
- pc_plotdata$group <- factor(pc_plotdata$group, levels = mixedsort(groups))
200
- p <- ggplot(pc_plotdata) +
201
- geom_point(aes(x, y, colour = factor(sel)), size = 0.5) +
202
- scale_color_manual(values = c("gray", "#ff4000")) +
203
- facet_wrap(~group, scales = "free", ncol = 4) +
204
- theme_bw() +
205
- labs(x = "Principal components", y = "Explained variance (%)") +
206
- theme(
207
- legend.position = "none", panel.grid.major = element_blank(),
208
- panel.grid.minor = element_blank(),
209
- axis.line = element_line(linewidth = 0.2, colour = "black"),
210
- axis.ticks = element_line(colour = "black", linewidth = 0.2),
211
- axis.text.x = element_text(colour = "black", size = 6),
212
- axis.text.y = element_text(colour = "black", size = 6),
213
- strip.background = element_rect(fill = "white", linewidth = 0.2, colour = NULL),
214
- strip.text = element_text(size = 6)
215
- )
216
-
217
- ggsave(file.path(subset_dir, "PC_variance_plot.pdf"), p, device = "pdf", useDingbats = FALSE)
218
222
 
219
- list(
220
- list(kind = "descr", content = "Metabolic pathways enriched in genes with highest contribution to the metabolic heterogeneities"),
221
- list(kind = "image", src = bubblefile),
222
- h1 = ifelse(is.null(s), "Metabolic pathway heterogeneity", paste0(subset_prefix, s))
223
- )
224
- }
223
+ do_case <- function(casename) {
224
+ log$info("Processing case: {casename} ...")
225
+ case <- cases[[casename]]
226
+ caseinfo <- case_info(casename, outdir, create = TRUE)
225
227
 
226
- do_one_subset_col <- function(subset_col, subset_prefix) {
227
- log_info(paste0("- Handling subset column: ", subset_col, " ..."))
228
- if (is.null(subset_col)) {
229
- x <- do_one_subset(NULL, subset_col = NULL, subset_prefix = NULL)
230
- do.call(add_report, x)
228
+ if (is.null(case$subset_by)) {
229
+ result <- do_subset(
230
+ sobj,
231
+ caseinfo = caseinfo,
232
+ subset_by = NULL,
233
+ subset_val = NULL,
234
+ group_by = case$group_by,
235
+ plots = case$plots,
236
+ select_pcs = case$select_pcs,
237
+ pathway_pval_cutoff = case$pathway_pval_cutoff
238
+ )
231
239
  } else {
232
- subsets <- na.omit(unique(sobj@meta.data[[subset_col]]))
240
+ sobj_avail <- filter(sobj, !is.na(!!sym(case$subset_by)))
241
+ subsets <- unique(sobj@meta.data[[case$subset_by]])
233
242
 
234
- if (ncores == 1) {
235
- x = lapply(subsets, do_one_subset, subset_col = subset_col, subset_prefix = subset_prefix)
236
- } else {
237
- x <- mclapply(subsets, do_one_subset, subset_col = subset_col, subset_prefix = subset_prefix, mc.cores = ncores)
238
- if (any(unlist(lapply(x, class)) == "try-error")) {
239
- stop(paste0("\nmclapply error:", x))
243
+ lapply(
244
+ subsets,
245
+ function(ss) {
246
+ do_subset(
247
+ sobj_avail,
248
+ caseinfo = caseinfo,
249
+ subset_by = case$subset_by,
250
+ subset_val = ss,
251
+ group_by = case$group_by,
252
+ plots = case$plots,
253
+ select_pcs = case$select_pcs,
254
+ pathway_pval_cutoff = case$pathway_pval_cutoff
255
+ )
240
256
  }
241
- }
242
- for (r in x) { do.call(add_report, r) }
257
+ )
243
258
  }
244
259
  }
245
260
 
246
- if (is.null(subsetting_cols)) {
247
- do_one_subset_col(NULL)
248
- } else {
249
- for (i in seq_along(subsetting_cols)) {
250
- do_one_subset_col(subsetting_cols[i], subsetting_prefix[i])
251
- }
252
- }
261
+ sapply(names(cases), do_case)
253
262
 
254
- save_report(joboutdir)
263
+ reporter$save(dirname(outdir))
@@ -1,7 +1,7 @@
1
- {{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
2
1
  library(rlang)
3
2
  library(rtracklayer)
4
3
  library(MatrixEQTL)
4
+ library(biopipen.utils)
5
5
 
6
6
  snpfile = {{in.geno | r}}
7
7
  expfile = {{in.expr | r}}
@@ -23,6 +23,8 @@ transpose_geno = {{envs.transpose_geno | r}}
23
23
  transpose_expr = {{envs.transpose_expr | r}}
24
24
  transpose_cov = {{envs.transpose_cov | r}}
25
25
 
26
+ log <- get_logger()
27
+
26
28
  arg_match(model, c("modelANOVA", "modelLINEAR", "linear", "anova"))
27
29
  if (model == "linear") model = "modelLINEAR"
28
30
  if (model == "anova") model = "modelANOVA"
@@ -33,14 +35,14 @@ cis_enabled = !is.null(snppos) && !is.null(genepos) && dist > 0
33
35
 
34
36
  # if trans is disabled, all files needed for cis should be provided
35
37
  if (!trans_enabled && !cis_enabled) {
36
- log_warn("Using `envs.transp = 1e-5` since cis-eQTL is disabled.")
38
+ log$warn("Using `envs.transp = 1e-5` since cis-eQTL is disabled.")
37
39
  trans_enabled <- TRUE
38
40
  transp <- 1e-5
39
41
  }
40
42
 
41
43
  transpose_file <- function(file, what) {
42
44
  if (is.null(file)) return(NULL)
43
- log_info("Transposing {what} file ...")
45
+ log$info("Transposing {what} file ...")
44
46
  out <- file.path(joboutdir, paste0(
45
47
  tools::file_path_sans_ext(basename(file)),
46
48
  ".transposed.",
@@ -55,7 +57,7 @@ if (transpose_geno) snpfile = transpose_file(snpfile, "geno")
55
57
  if (transpose_expr) expfile = transpose_file(expfile, "expr")
56
58
  if (transpose_cov) covfile = transpose_file(covfile, "cov")
57
59
 
58
- log_info("Loading SNP data ...")
60
+ log$info("Loading SNP data ...")
59
61
  snps = SlicedData$new();
60
62
  snps$fileDelimiter = "\t"; # the TAB character
61
63
  snps$fileOmitCharacters = "NA"; # denote missing values;
@@ -64,7 +66,7 @@ snps$fileSkipColumns = 1; # one column of row labels
64
66
  snps$fileSliceSize = 10000; # read file in pieces of 2,000 rows
65
67
  snps$LoadFile( snpfile );
66
68
 
67
- log_info("Loading gene expression data ...")
69
+ log$info("Loading gene expression data ...")
68
70
  gene = SlicedData$new();
69
71
  gene$fileDelimiter = "\t"; # the TAB character
70
72
  gene$fileOmitCharacters = "NA"; # denote missing values;
@@ -75,12 +77,12 @@ gene$LoadFile( expfile );
75
77
 
76
78
  cvrt = SlicedData$new();
77
79
  if (!is.null(covfile) && file.exists(covfile)) {
78
- log_info("Loading covariate data ...")
80
+ log$info("Loading covariate data ...")
79
81
  covmatrix = read.table(covfile, header=TRUE, stringsAsFactors=FALSE, row.names=1, sep="\t", quote="", check.names=FALSE)
80
82
  cvrt$CreateFromMatrix( as.matrix(covmatrix) )
81
83
  }
82
84
 
83
- log_info("Matching samples ...")
85
+ log$info("Matching samples ...")
84
86
  if (match_samples) {
85
87
  # let matrixEQTL raise an error if samples do not match
86
88
  } else {
@@ -94,14 +96,14 @@ if (match_samples) {
94
96
  }
95
97
  snps = snps$ColumnSubsample(match(common_samples, snps$columnNames))
96
98
  gene = gene$ColumnSubsample(match(common_samples, gene$columnNames))
97
- log_info("- Samples used in SNP data: {n_sample_snps} -> {snps$nCols()}")
98
- log_info("- Samples used in gene expression data: {n_sample_gene} -> {gene$nCols()}")
99
+ log$info("- Samples used in SNP data: {n_sample_snps} -> {snps$nCols()}")
100
+ log$info("- Samples used in gene expression data: {n_sample_gene} -> {gene$nCols()}")
99
101
  if (!is.null(covfile)) {
100
- log_info("- Samples used in covariate data: {n_sample_cov} -> {cvrt$nCols()}")
102
+ log$info("- Samples used in covariate data: {n_sample_cov} -> {cvrt$nCols()}")
101
103
  }
102
104
  }
103
105
 
104
- log_info("Composing engine parameters ...")
106
+ log$info("Composing engine parameters ...")
105
107
  engine_params = list()
106
108
  engine_params$snps = snps
107
109
  engine_params$gene = gene
@@ -118,7 +120,7 @@ noq = function(s) {
118
120
  }
119
121
 
120
122
  if (cis_enabled) {
121
- log_info("Loading SNP positions ...")
123
+ log$info("Loading SNP positions ...")
122
124
  if (endsWith(snppos, ".bed")) {
123
125
  snppos_data = read.table(snppos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
124
126
  snppos_data = data.frame(
@@ -145,17 +147,25 @@ if (cis_enabled) {
145
147
  snppos_data = snppos_data[, c(3, 1, 2)]
146
148
  colnames(snppos_data) = c("snp", "chr", "pos")
147
149
  } else {
150
+ # snp chr pos
151
+ # Snp_01 chr1 721289
152
+ # Snp_02 chr1 752565
153
+ # check if 3rd column of the first line is numeric.
154
+ # if it is, there is no header; otherwise, it is a header.
155
+ header <- is.na(suppressWarnings(as.numeric(strsplit(readLines(snppos, n = 1), "\t")[[1]][3])))
156
+
148
157
  snppos_data = read.table(
149
158
  snppos,
150
- header=FALSE,
151
- row.names=NULL,
152
- stringsAsFactors=FALSE,
153
- check.names=FALSE
159
+ sep = "\t",
160
+ header = header,
161
+ row.names = NULL,
162
+ stringsAsFactors = FALSE,
163
+ check.names = FALSE
154
164
  )
155
165
  colnames(snppos_data) = c("snp", "chr", "pos")
156
166
  }
157
167
 
158
- log_info("Loading gene positions ...")
168
+ log$info("Loading gene positions ...")
159
169
  if (endsWith(genepos, ".bed")) {
160
170
  genepos_data = read.table(genepos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
161
171
  genepos_data = data.frame(
@@ -174,11 +184,20 @@ if (cis_enabled) {
174
184
  s2 = end(genepos_data)
175
185
  )
176
186
  } else {
177
- genepos_data = read.table(genepos, header = TRUE, stringsAsFactors = FALSE);
187
+ parts <- strsplit(readLines(genepos, n = 1), "\t")[[1]]
188
+ header <- is.na(suppressWarnings(as.numeric(parts[3]))) || is.na(suppressWarnings(as.numeric(parts[4])))
189
+ genepos_data = read.table(
190
+ genepos,
191
+ sep = "\t",
192
+ header = header,
193
+ row.names = NULL,
194
+ stringsAsFactors = FALSE,
195
+ check.names = FALSE
196
+ )
178
197
  colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
179
198
  }
180
199
 
181
- log_info("Running MatrixEQTL with cis-eQTLs enabled ...")
200
+ log$info("Running MatrixEQTL with cis-eQTLs enabled ...")
182
201
  engine_params$output_file_name.cis = outfile
183
202
  engine_params$pvOutputThreshold.cis = min(pval, 1)
184
203
  engine_params$cisDist = dist
@@ -187,7 +206,7 @@ if (cis_enabled) {
187
206
  do_call(Matrix_eQTL_main, engine_params)
188
207
  if (!file.exists(alleqtl)) file.create(alleqtl)
189
208
  } else {
190
- log_info("Running MatrixEQTL without cis-eQTLs ...")
209
+ log$info("Running MatrixEQTL without cis-eQTLs ...")
191
210
  do_call(Matrix_eQTL_engine, engine_params)
192
211
  if (!file.exists(outfile)) file.create(outfile)
193
212
  }