biopipen 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (83) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +146 -0
  11. biopipen/ns/regulation.py +214 -0
  12. biopipen/ns/scrna.py +15 -3
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +74 -2
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  36. biopipen/scripts/gene/GenePromoters.R +61 -0
  37. biopipen/scripts/misc/Shell.sh +15 -0
  38. biopipen/scripts/plot/Manhattan.R +140 -0
  39. biopipen/scripts/plot/QQPlot.R +62 -0
  40. biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
  41. biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
  42. biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
  43. biopipen/scripts/regulation/MotifScan.py +159 -0
  44. biopipen/scripts/regulation/atSNP.R +33 -0
  45. biopipen/scripts/regulation/motifBreakR.R +1594 -0
  46. biopipen/scripts/scrna/CellsDistribution.R +2 -0
  47. biopipen/scripts/scrna/MarkersFinder.R +59 -67
  48. biopipen/scripts/scrna/SeuratClustering.R +63 -29
  49. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  50. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  51. biopipen/scripts/snp/MatrixEQTL.R +84 -43
  52. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  53. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  54. biopipen/scripts/snp/PlinkFilter.py +100 -0
  55. biopipen/scripts/snp/PlinkFreq.R +298 -0
  56. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  57. biopipen/scripts/snp/PlinkHWE.R +80 -0
  58. biopipen/scripts/snp/PlinkHet.R +92 -0
  59. biopipen/scripts/snp/PlinkIBD.R +197 -0
  60. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  61. biopipen/scripts/stats/MetaPvalue.R +2 -1
  62. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  63. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  64. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  65. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  66. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  67. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  68. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  69. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  70. biopipen/utils/gene.R +83 -37
  71. biopipen/utils/gene.py +108 -60
  72. biopipen/utils/misc.R +56 -0
  73. biopipen/utils/misc.py +5 -2
  74. biopipen/utils/reference.py +54 -10
  75. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
  76. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/RECORD +78 -50
  77. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
  78. biopipen/ns/bcftools.py +0 -111
  79. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  80. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  81. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  82. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  83. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,298 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+ source("{{biopipen_dir}}/utils/plot.R")
3
+ library(rlang)
4
+ library(ggprism)
5
+ theme_set(theme_prism())
6
+
7
+ indir <- {{in.indir | r}}
8
+ outdir <- {{out.outdir | r}}
9
+ plink <- {{envs.plink | r}}
10
+ ncores <- {{envs.ncores | r}}
11
+ modifier <- {{envs.modifier | r}}
12
+ gz <- {{envs.gz | r}}
13
+ cutoffs <- {{envs.cutoff | r}}
14
+ filters <- {{envs.filter | r}}
15
+ doplot <- {{envs.plot | r}}
16
+ devpars <- {{envs.devpars | r}}
17
+
18
+ bedfile = Sys.glob(file.path(indir, '*.bed'))
19
+ if (length(bedfile) == 0)
20
+ stop("No bed files found in the input directory.")
21
+ if (length(bedfile) > 1) {
22
+ log_warn("Multiple bed files found in the input directory. Using the first one.")
23
+ bedfile <- bedfile[1]
24
+ }
25
+ input <- tools::file_path_sans_ext(bedfile)
26
+ output <- file.path(outdir, basename(input))
27
+
28
+ modifier <- match.arg(modifier, c("none", "counts", "x"))
29
+
30
+ cmd <- c(
31
+ plink,
32
+ "--threads", ncores,
33
+ "--bfile", input,
34
+ "--out", output
35
+ )
36
+ if (modifier == "counts") {
37
+ cmd <- c(cmd, "--freq", "counts")
38
+ if (!is.list(cutoffs)) { cutoffs <- list(ALT1_CT = cutoffs) }
39
+ # } else if (modifier == "case-control") {
40
+ # cmd <- c(cmd, "--freq", "case-control")
41
+ # if (!is.list(cutoffs)) { cutoffs <- list(MAF_A = cutoffs) }
42
+ } else if (modifier == "x") {
43
+ cmd <- c(cmd, "--geno-counts")
44
+ if (!is.list(cutoffs)) { cutoffs <- list("HOM_ALT1_CT" = cutoffs) }
45
+ } else {
46
+ cmd <- c(cmd, "--freq")
47
+ if (!is.list(cutoffs)) { cutoffs <- list(MAF = cutoffs) }
48
+ }
49
+ if (isTRUE(gz)) { cmd <- c(cmd, "gz") }
50
+
51
+ if (!is.list(filters)) {
52
+ filters <- as.list(rep(filters %||% "no", length(cutoffs))) # NULL would collapse to list() and break names<- below; treat it as "no", as post_process does
53
+ names(filters) <- names(cutoffs)
54
+ } else {
55
+ for (name in names(filters)) {
56
+ if (is.null(cutoffs[[name]])) {
57
+ stop(paste0("Cutoff for filter ", name, " is not provided."))
58
+ }
59
+ }
60
+ }
61
+
62
+ run_command(cmd, fg = TRUE)
63
+
64
+ post_process <- function(suffix, snp_col = "ID", sep = "\t", modifier = NULL) {
65
+ freq <- read.table(
66
+ paste0(output, suffix),
67
+ header=TRUE,
68
+ check.names=FALSE,
69
+ row.names = NULL,
70
+ sep = sep,
71
+ comment = ""
72
+ )
73
+ colnames(freq)[1] <- sub("#", "", colnames(freq)[1])
74
+ if (!is.null(modifier)) { freq <- modifier(freq) }
75
+ iter_in <- input
76
+ n <- 0
77
+ for (metric_col in names(cutoffs)) {
78
+ if (is.null(cutoffs[[metric_col]])) {
79
+ stop(paste0(
80
+ "Cutoff for metric ",
81
+ metric_col,
82
+ " is not provided in ",
83
+ suffix, "(x) file."))
84
+ }
85
+
86
+ freq[[metric_col]] <- as.numeric(freq[[metric_col]])
87
+ cutoff <- cutoffs[[metric_col]]
88
+ filter <- filters[[metric_col]] %||% "no"
89
+
90
+ if (filter == "no") {
91
+ ge_flag <- paste0(metric_col, " >= ", cutoff)
92
+ lt_flag <- paste0(metric_col, " < ", cutoff)
93
+ freq$GE <- freq[[metric_col]] >= cutoff
94
+ freq$Flag <- ifelse(freq$GE, ge_flag, lt_flag)
95
+ freq$Flag <- factor(freq$Flag, levels = c(ge_flag, lt_flag))
96
+ write.table(
97
+ freq[[snp_col]][freq$GE],
98
+ file = paste0(output, suffix, ".", metric_col, ".ge"),
99
+ col.names=FALSE,
100
+ row.names=FALSE,
101
+ quote=FALSE
102
+ )
103
+ write.table(
104
+ freq[[snp_col]][!freq$GE],
105
+ file = paste0(output, suffix, ".", metric_col, ".lt"),
106
+ col.names=FALSE,
107
+ row.names=FALSE,
108
+ quote=FALSE
109
+ )
110
+
111
+ if (doplot) {
112
+ plotGG(
113
+ data = freq,
114
+ geom = "histogram",
115
+ outfile = paste0(output, suffix, ".", metric_col, ".png"),
116
+ args = list(aes(x = !!sym(metric_col), fill = Flag), alpha = 0.8, bins = 50),
117
+ ggs = c(
118
+ sprintf('xlab("%s")', metric_col),
119
+ 'ylab("Count")',
120
+ sprintf('geom_vline(xintercept = %.3f, color = "red", linetype="dashed")', cutoff),
121
+ sprintf(
122
+ 'geom_text(aes(x = %.3f, y = Inf, label = as.character(%.3f)), colour="blue", vjust = 1.5, hjust = -.1)',
123
+ cutoff, cutoff
124
+ ),
125
+ sprintf(
126
+ 'scale_fill_manual(values = c("%s" = "blue3", "%s" = "green3"))',
127
+ ge_flag, lt_flag
128
+ )
129
+ ),
130
+ devpars = devpars
131
+ )
132
+ }
133
+ } else {
134
+ iter_dir <- file.path(outdir, paste0(metric_col, "_filtered"))
135
+ dir.create(iter_dir, showWarnings = FALSE)
136
+ iter_out <- file.path(iter_dir, basename(output))
137
+
138
+ filter <- match.arg(filter, c("gt", "lt", "ge", "le"))
139
+ indicate <- function(metric){
140
+ if (filter == "gt") {
141
+ return(freq[[metric_col]] > cutoff)
142
+ } else if (filter == "lt") {
143
+ return(freq[[metric_col]] < cutoff)
144
+ } else if (filter == "ge") {
145
+ return(freq[[metric_col]] >= cutoff)
146
+ } else if (filter == "le") {
147
+ return(freq[[metric_col]] <= cutoff)
148
+ }
149
+ }
150
+ freq$Flag <- ifelse(indicate(freq), "Fail", "Pass")
151
+ failfile <- paste0(output, suffix, ".", metric_col, ".fail")
152
+ write.table(
153
+ freq[[snp_col]][freq$Flag == "Fail"],
154
+ file = failfile,
155
+ col.names=FALSE,
156
+ row.names=FALSE,
157
+ quote=FALSE
158
+ )
159
+
160
+ if (doplot) {
161
+ plotGG(
162
+ data = freq,
163
+ geom = "histogram",
164
+ outfile = paste0(output, suffix, ".", metric_col, ".png"),
165
+ args = list(aes(x = !!sym(metric_col), fill = Flag), alpha = 0.8, bins = 50),
166
+ ggs = c(
167
+ sprintf('xlab("%s")', metric_col),
168
+ 'ylab("Count")',
169
+ sprintf('geom_vline(xintercept = %.3f, color = "blue", linetype="dashed")', cutoff),
170
+ sprintf(
171
+ 'geom_text(aes(x = %.3f, y = Inf, label = as.character(%.3f)), colour="blue", vjust = 1.5, hjust = -.1)',
172
+ cutoff, cutoff
173
+ ),
174
+ 'theme(legend.position = "none")',
175
+ 'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
176
+ ),
177
+ devpars = devpars
178
+ )
179
+ }
180
+
181
+ filter_cmd <- c(
182
+ plink,
183
+ "--threads", ncores,
184
+ "--bfile", shQuote(iter_in),
185
+ "--exclude", shQuote(failfile),
186
+ "--make-bed",
187
+ "--out", shQuote(iter_out)
188
+ )
189
+ run_command(filter_cmd, fg = TRUE)
190
+
191
+ iter_in <- iter_out
192
+ n <- n + 1
193
+
194
+ if (n == length(cutoffs)) {
195
+ # make symbolic links to output from input .bed, .bim and .fam files
196
+ file.symlink(paste0(iter_in, '.bed'), paste0(output, '.bed'))
197
+ file.symlink(paste0(iter_in, '.bim'), paste0(output, '.bim'))
198
+ file.symlink(paste0(iter_in, '.fam'), paste0(output, '.fam'))
199
+ }
200
+ }
201
+ }
202
+ }
203
+
204
+ splitup <- function(x, agg = NULL) {
205
+ pieces <- strsplit(as.character(x), ",") # one character vector of comma-separated fields per element of x
206
+ if (!is.null(agg)) {
207
+ return(sapply(pieces, agg)) # reduce each element's fields with agg
208
+ }
209
+ return(pieces)
210
+ }
211
+ if (modifier == "none") {
212
+ mod <- function(freq) {
213
+ # Add ALT1, ALT1_FREQ, REF_FREQ and MAF columns
214
+ writing = FALSE
215
+ if (is.null(freq$ALT1)) {
216
+ # should be the first allele of ALT
217
+ freq$ALT1 <- splitup(freq$ALT, agg = function(s) s[1])
218
+ writing = TRUE
219
+ }
220
+ if (is.null(freq$ALT1_FREQ)) {
221
+ freq$ALT1_FREQ <- as.double(splitup(freq$ALT_FREQS, agg = function(s) s[1]))
222
+ writing = TRUE
223
+ }
224
+ if (is.null(freq$REF_FREQ)) {
225
+ freq$REF_FREQ <- 1 - splitup(freq$ALT_FREQS, agg = function(s) sum(as.double(s)))
226
+ writing = TRUE
227
+ }
228
+ if (is.null(freq$MAF)) {
229
+ min_alt_freqs <- splitup(freq$ALT_FREQS, agg = function(s) min(as.double(s)))
230
+ freq$MAF <- pmin(freq$REF_FREQ, min_alt_freqs)
231
+ writing = TRUE
232
+ }
233
+ if (writing) {
234
+ write.table(
235
+ freq,
236
+ file = paste0(output, ".afreqx"),
237
+ col.names=TRUE,
238
+ row.names=FALSE,
239
+ quote=FALSE,
240
+ sep = "\t"
241
+ )
242
+ }
243
+ return(freq)
244
+ }
245
+ post_process(".afreq", modifier = mod)
246
+ } else if (modifier == "counts") {
247
+ mod <- function(freq) {
248
+ # Add ALT1, ALT1_CT, and REF_CT columns
249
+ writing = FALSE
250
+ if (is.null(freq$ALT1)) {
251
+ # should be the first allele of ALT
252
+ freq$ALT1 <- splitup(freq$ALT, agg = function(s) s[1])
253
+ writing = TRUE
254
+ }
255
+ if (is.null(freq$ALT1_CT)) {
256
+ freq$ALT1_CT <- as.integer(splitup(freq$ALT_CTS, agg = function(s) s[1]))
257
+ writing = TRUE
258
+ }
259
+ if (is.null(freq$REF_CT)) {
260
+ freq$REF_CT <- freq$OBS_CT - splitup(freq$ALT_CTS, agg = function(s) sum(as.integer(s)))
261
+ writing = TRUE
262
+ }
263
+ if (writing) {
264
+ write.table(
265
+ freq,
266
+ file = paste0(output, ".acountx"),
267
+ col.names=TRUE,
268
+ row.names=FALSE,
269
+ quote=FALSE,
270
+ sep = "\t"
271
+ )
272
+ }
273
+ return(freq)
274
+ }
275
+ post_process(".acount", modifier = mod)
276
+ # } else if (modifier == "case-control") {
277
+ # post_process(".frq.cc")
278
+ } else if (modifier == "x") {
279
+ mod <- function(freq) {
280
+ # Add ALT1, HET_REF_ALT1_CT, HOM_ALT1_CT
281
+ writing = FALSE
282
+ if (is.null(freq$ALT1)) {
283
+ # should be the first allele of ALT
284
+ freq$ALT1 <- splitup(freq$ALT, agg = function(s) s[1])
285
+ writing = TRUE
286
+ }
287
+ if (is.null(freq$HET_REF_ALT1_CT)) {
288
+ freq$HET_REF_ALT1_CT <- as.integer(splitup(freq$HET_REF_ALT_CTS, agg = function(s) s[1]))
289
+ writing = TRUE
290
+ }
291
+ if (is.null(freq$HOM_ALT1_CT)) {
292
+ freq$HOM_ALT1_CT <- as.integer(splitup(freq$TWO_ALT_GENO_CTS, agg = function(s) s[1]))
293
+ writing = TRUE
294
+ }
295
+ return(freq)
296
+ }
297
+ post_process(".gcount", modifier = mod)
298
+ }
@@ -0,0 +1,78 @@
1
+ from os import path
2
+ from biopipen.core.filters import dict_to_cli_args
3
+ from biopipen.utils.reference import tabix_index
4
+ from biopipen.utils.misc import run_command
5
+
6
+ invcf = {{in.invcf | repr}} # noqa: E999 # pyright: ignore
7
+ outprefix = {{in.invcf | stem0 | repr}} # pyright: ignore
8
+ outdir = {{out.outdir | repr}} # pyright: ignore
9
+ args = {{envs | dict | repr}} # pyright: ignore
10
+
11
+ plink = args.pop("plink")
12
+ tabix = args.pop("tabix")
13
+ ncores = args.pop("ncores")
14
+
15
+ # normalize vcf-filter
16
+ args.setdefault("vcf_filter", True)
17
+ if isinstance(args["vcf_filter"], str):
18
+ args["vcf_filter"] = args["vcf_filter"].split()
19
+
20
+ # normalize biallelic-only
21
+ args.setdefault("max_alleles", 2)
22
+
23
+ # This makes it possible to keep the allele order in the output
24
+ # no need for plink2
25
+ # args["keep_allele_order"] = True
26
+
27
+ # resolve plink 1.x --set-missing-var-ids doesn't distinguish $1, $2,...
28
+ # for ref and alts
29
+ # if (
30
+ # "set_missing_var_ids" in args
31
+ # and args["set_missing_var_ids"]
32
+ # and ("$" in args["set_missing_var_ids"] or "%" in args["set_missing_var_ids"])
33
+ # ):
34
+ # tmpfile = path.join(outdir, 'with_var_ids.vcf')
35
+ # set_missing_var_ids = args.pop("set_missing_var_ids")
36
+ # set_missing_var_ids = (
37
+ # set_missing_var_ids
38
+ # .replace("@", "%CHROM")
39
+ # .replace("#", "%POS")
40
+ # .replace("$1", "%REF")
41
+ # .replace("$2", "%ALT{0}")
42
+ # .replace("$3", "%ALT{1}")
43
+ # .replace("$4", "%ALT{2}")
44
+ # .replace("$5", "%ALT{3}")
45
+ # .replace("$6", "%ALT{4}")
46
+ # .replace("%CHROM_", "%CHROM\\_")
47
+ # .replace("%POS_", "%POS\\_")
48
+ # .replace("%REF_", "%REF\\_")
49
+ # )
50
+ # set_vid_cmd = [
51
+ # bcftools,
52
+ # "annotate",
53
+ # "--set-id",
54
+ # f"+{set_missing_var_ids}",
55
+ # "--output-type",
56
+ # "z",
57
+ # "--output",
58
+ # tmpfile,
59
+ # "--threads",
60
+ # ncores,
61
+ # invcf,
62
+ # ]
63
+
64
+ # run_command(set_vid_cmd, fg=True, env={"cwd": outdir})
65
+ # invcf = tmpfile
66
+
67
+ invcf = tabix_index(invcf, "vcf", tabix=tabix)
68
+ args["vcf"] = invcf
69
+ args["out"] = path.join(outdir, outprefix)
70
+ args["threads"] = ncores
71
+
72
+ cmd = [
73
+ plink,
74
+ "--make-bed",
75
+ *dict_to_cli_args(args, dup_key=False, dashify = True),
76
+ ]
77
+
78
+ run_command(cmd, fg=True, env={"cwd": outdir})
@@ -0,0 +1,80 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+ source("{{biopipen_dir}}/utils/plot.R")
3
+ library(ggprism)
4
+ theme_set(theme_prism())
5
+
6
+ indir <- {{in.indir | r}}
7
+ outdir <- {{out.outdir | r}}
8
+ plink <- {{envs.plink | r}}
9
+ ncores <- {{envs.ncores | r}}
10
+ cutoff <- {{envs.cutoff | r}}
11
+ doplot <- {{envs.plot | r}}
12
+ devpars <- {{envs.devpars | r}}
13
+
14
+ bedfile = Sys.glob(file.path(indir, '*.bed'))
15
+ if (length(bedfile) == 0)
16
+ stop("No bed files found in the input directory.")
17
+ if (length(bedfile) > 1) {
18
+ log_warn("Multiple bed files found in the input directory. Using the first one.")
19
+ bedfile <- bedfile[1]
20
+ }
21
+ input <- tools::file_path_sans_ext(bedfile)
22
+ output <- file.path(outdir, basename(input))
23
+
24
+ cmd <- c(
25
+ plink,
26
+ "--threads", ncores,
27
+ "--bfile", input,
28
+ "--hardy",
29
+ "--out", output
30
+ )
31
+ run_command(cmd, fg = TRUE)
32
+
33
+ hardy <- read.table(
34
+ paste0(output, '.hardy'),
35
+ header = TRUE,
36
+ row.names = NULL,
37
+ check.names = FALSE,
38
+ comment.char = ""
39
+ )
40
+ hardy.fail <- hardy[which(hardy$P < cutoff), 'ID', drop = FALSE]
41
+ write.table(
42
+ hardy.fail,
43
+ paste0(output, '.hardy.fail'),
44
+ col.names = FALSE,
45
+ row.names = FALSE,
46
+ sep = "\t",
47
+ quote = FALSE
48
+ )
49
+
50
+ if (doplot) {
51
+ hardy$Pval <- -log10(hardy$P)
52
+ hardy$Status <- "Pass"
53
+ hardy[which(hardy$ID %in% hardy.fail$ID), "Status"] <- "Fail" # hardy.fail is built from the ID column only, so match on ID
54
+
55
+ plotGG(
56
+ data = hardy,
57
+ geom = "histogram",
58
+ outfile = paste0(output, '.hardy.png'),
59
+ args = list(aes(x = Pval, fill = Status), alpha = 0.8, bins = 50),
60
+ ggs = c(
61
+ 'xlab("-log10(HWE p-value)")',
62
+ 'ylab("Count")',
63
+ 'geom_vline(xintercept = -log10(cutoff), color = "red", linetype="dashed")',
64
+ 'theme(legend.position = "none")',
65
+ 'geom_text(aes(x = -log10(cutoff), y = Inf, label = cutoff), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
66
+ 'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))' # Added line to set "Fail" color to red
67
+ ),
68
+ devpars = devpars
69
+ )
70
+ }
71
+
72
+ cmd <- c(
73
+ plink,
74
+ "--threads", ncores,
75
+ "--bfile", input,
76
+ "--exclude", paste0(output, '.hardy.fail'),
77
+ "--make-bed",
78
+ "--out", output
79
+ )
80
+ run_command(cmd, fg = TRUE)
@@ -0,0 +1,92 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+ source("{{biopipen_dir}}/utils/plot.R")
3
+ library(ggprism)
4
+ theme_set(theme_prism())
5
+
6
+ indir <- {{in.indir | r}}
7
+ outdir <- {{out.outdir | r}}
8
+ plink <- {{envs.plink | r}}
9
+ ncores <- {{envs.ncores | r}}
10
+ cutoff <- {{envs.cutoff | r}}
11
+ doplot <- {{envs.plot | r}}
12
+ devpars <- {{envs.devpars | r}}
13
+
14
+ bedfile = Sys.glob(file.path(indir, '*.bed'))
15
+ if (length(bedfile) == 0)
16
+ stop("No bed files found in the input directory.")
17
+ if (length(bedfile) > 1) {
18
+ log_warn("Multiple bed files found in the input directory. Using the first one.")
19
+ bedfile <- bedfile[1]
20
+ }
21
+ input <- tools::file_path_sans_ext(bedfile)
22
+ output <- file.path(outdir, basename(input))
23
+
24
+ # need .afreq for --het for plink2
25
+ freq_cmd <- cmd <- c(
26
+ plink,
27
+ "--threads", ncores,
28
+ "--bfile", input,
29
+ "--freq",
30
+ "--out", output
31
+ )
32
+ run_command(freq_cmd, fg = TRUE)
33
+
34
+ cmd <- c(
35
+ plink,
36
+ "--threads", ncores,
37
+ "--bfile", input,
38
+ "--het",
39
+ "--out", output,
40
+ "--read-freq", paste0(output, '.afreq')
41
+ )
42
+ run_command(cmd, fg = TRUE)
43
+
44
+ phet <- read.table(
45
+ paste0(output, '.het'),
46
+ header = TRUE,
47
+ row.names = NULL,
48
+ check.names = FALSE,
49
+ comment.char = ""
50
+ )
51
+ het <- data.frame(Het = 1 - phet[, "O(HOM)"]/phet[, "OBS_CT"])
52
+ rownames(het) <- do.call(paste, c(unname(as.list(phet[, sub("^#", "", colnames(phet)) %in% c("FID", "IID"), drop = FALSE])), sep = "\t")) # header is read verbatim, so the first field may be "#FID"/"#IID"; use whichever of FID/IID is present
53
+ het.mean <- mean(het$Het, na.rm = TRUE)
54
+ het.sd <- sd(het$Het, na.rm = TRUE)
55
+ het.fail <- rownames(het[
56
+ !is.na(het$Het) & (het$Het < het.mean-cutoff*het.sd | het$Het > het.mean+cutoff*het.sd), , drop = FALSE
57
+ ])
58
+ writeLines(het.fail, con = paste0(output, '.het.fail')) # pass the path directly instead of an inline file() object that is never closed
59
+
60
+ if (doplot) {
61
+ het$Status <- "Pass"
62
+ het[het.fail, "Status"] <- "Fail"
63
+
64
+ plotGG(
65
+ data = het,
66
+ geom = "histogram",
67
+ outfile = paste0(output, '.het.png'),
68
+ args = list(aes(fill = Status, x = Het), alpha = 0.8, bins = 50),
69
+ ggs = c(
70
+ 'xlab("Sample Heterozygosity")',
71
+ 'ylab("Count")',
72
+ 'geom_vline(xintercept = c(het.mean-cutoff*het.sd, het.mean+cutoff*het.sd), color = "red", linetype="dashed")',
73
+ 'geom_vline(xintercept = het.mean, color = "blue", linetype="dashed")',
74
+ 'theme(legend.position = "none")',
75
+ 'geom_text(aes(x = het.mean-cutoff*het.sd, y = Inf, label = sprintf("mean - %ssd (%.3f)", cutoff, het.mean - cutoff*het.sd)), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
76
+ 'geom_text(aes(x = het.mean+cutoff*het.sd, y = Inf, label = sprintf("mean + %ssd (%.3f)", cutoff, het.mean + cutoff*het.sd)), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
77
+ 'geom_text(aes(x = het.mean, y = Inf, label = sprintf("mean (%.3f)", het.mean)), colour="blue", vjust = 1.5, hjust = -.1)',
78
+ 'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
79
+ ),
80
+ devpars = devpars
81
+ )
82
+ }
83
+
84
+ cmd <- c(
85
+ plink,
86
+ "--threads", ncores,
87
+ "--bfile", input,
88
+ "--remove", paste0(output, '.het.fail'),
89
+ "--make-bed",
90
+ "--out", output
91
+ )
92
+ run_command(cmd, fg = TRUE)