biopipen 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +8 -0
- biopipen/ns/bam.py +0 -2
- biopipen/ns/bed.py +35 -0
- biopipen/ns/cellranger_pipeline.py +5 -5
- biopipen/ns/cnv.py +18 -2
- biopipen/ns/cnvkit_pipeline.py +16 -11
- biopipen/ns/gene.py +68 -23
- biopipen/ns/misc.py +2 -15
- biopipen/ns/plot.py +146 -0
- biopipen/ns/regulation.py +214 -0
- biopipen/ns/scrna.py +15 -3
- biopipen/ns/snp.py +516 -8
- biopipen/ns/stats.py +74 -2
- biopipen/ns/vcf.py +196 -0
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/scripts/bam/CNVpytor.py +144 -46
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMerge.py +1 -1
- biopipen/scripts/cnv/AneuploidyScore.R +30 -7
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
- biopipen/scripts/cnv/TMADScore.R +21 -5
- biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
- biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
- biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
- biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
- biopipen/scripts/gene/GeneNameConversion.R +65 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/plot/Manhattan.R +140 -0
- biopipen/scripts/plot/QQPlot.R +62 -0
- biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
- biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
- biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
- biopipen/scripts/regulation/MotifScan.py +159 -0
- biopipen/scripts/regulation/atSNP.R +33 -0
- biopipen/scripts/regulation/motifBreakR.R +1594 -0
- biopipen/scripts/scrna/CellsDistribution.R +2 -0
- biopipen/scripts/scrna/MarkersFinder.R +59 -67
- biopipen/scripts/scrna/SeuratClustering.R +63 -29
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
- biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
- biopipen/scripts/snp/MatrixEQTL.R +84 -43
- biopipen/scripts/snp/Plink2GTMat.py +133 -0
- biopipen/scripts/snp/PlinkCallRate.R +190 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +298 -0
- biopipen/scripts/snp/PlinkFromVcf.py +78 -0
- biopipen/scripts/snp/PlinkHWE.R +80 -0
- biopipen/scripts/snp/PlinkHet.R +92 -0
- biopipen/scripts/snp/PlinkIBD.R +197 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/MetaPvalue.R +2 -1
- biopipen/scripts/stats/MetaPvalue1.R +70 -0
- biopipen/scripts/tcr/TCRClusterStats.R +12 -7
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/VcfFix_utils.py +1 -1
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/utils/gene.R +83 -37
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.R +56 -0
- biopipen/utils/misc.py +5 -2
- biopipen/utils/reference.py +54 -10
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/RECORD +78 -50
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
- biopipen/ns/bcftools.py +0 -111
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
|
|
2
|
+
from os import path
|
|
3
|
+
from glob import glob
|
|
4
|
+
from biopipen.utils.misc import run_command, logger
|
|
5
|
+
|
|
6
|
+
indir = {{in.indir | repr}} # noqa: E999 # pyright: ignore
|
|
7
|
+
outfile = {{out.outfile | repr}} # pyright: ignore
|
|
8
|
+
plink = {{envs.plink | repr}} # pyright: ignore
|
|
9
|
+
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
10
|
+
transpose = {{envs.transpose | repr}} # pyright: ignore
|
|
11
|
+
samid = {{envs.samid | repr}} # pyright: ignore
|
|
12
|
+
varid = {{envs.varid | repr}} # pyright: ignore
|
|
13
|
+
trans_chr = {{envs.trans_chr | repr}} # pyright: ignore
|
|
14
|
+
missing_id = {{envs.missing_id | repr}} # pyright: ignore
|
|
15
|
+
trans_chr = trans_chr or {}
|
|
16
|
+
|
|
17
|
+
bedfile = glob(path.join(indir, '*.bed'))
|
|
18
|
+
if len(bedfile) == 0:
|
|
19
|
+
raise FileNotFoundError(f"No .bed file found in `in.indir`")
|
|
20
|
+
elif len(bedfile) > 1:
|
|
21
|
+
logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
|
|
22
|
+
|
|
23
|
+
bedfile = bedfile[0]
|
|
24
|
+
input = path.splitext(bedfile)[0]
|
|
25
|
+
output = path.splitext(outfile)[0]
|
|
26
|
+
|
|
27
|
+
# Pick the recode format: "A-transpose" unless `transpose` is set, else "A".
recode_mode = "A" if transpose else "A-transpose"

plink_cmd = [plink]
plink_cmd += ["--bfile", input]
plink_cmd += ["--out", output]
plink_cmd += ["--threads", ncores]
plink_cmd += ["--keep-allele-order"]
plink_cmd += ["--recode", recode_mode]

# NOTE(review): `cwd` is passed as a key inside `env`, not as a `cwd=`
# keyword; confirm this is what run_command expects.
run_command(plink_cmd, fg=True, env={"cwd": path.dirname(outfile)})
|
|
39
|
+
|
|
40
|
+
def _format_variant(chrom, var, pos, ref, alt):
    """Render a variant name from the `varid` template.

    The chromosome is first translated through `trans_chr` (unchanged when it
    has no entry) and a missing variant ID ("." or empty) is replaced with
    `missing_id`. The placeholders {chr}, {varid}, {pos}, {ref} and {alt} are
    then substituted in that order.
    """
    chrom = trans_chr.get(chrom, chrom)
    if var in (".", ""):
        var = missing_id
    return (
        varid
        .replace('{chr}', chrom)
        .replace('{varid}', var)
        .replace('{pos}', pos)
        .replace('{ref}', ref)
        .replace('{alt}', alt)
    )


def _format_sample(fid, iid):
    """Render a sample name from the `samid` template ({fid}, {iid})."""
    return samid.replace('{fid}', fid).replace('{iid}', iid)


if not transpose:  # rows are variants, columns are samples
    # .traw file is created, tab-separated, with the following columns:
    # CHR             Chromosome code
    # SNP             Variant identifier
    # (C)M            Position in morgans or centimorgans
    # POS             Base-pair coordinate
    # COUNTED         Counted allele (defaults to A1), the actual alternative
    #                 allele with --keep-allele-order
    # ALT             Other allele(s), comma-separated, the actual reference
    #                 allele
    # <FID>_<IID>...  Allelic dosages
    #                 (0/1/2/'NA' for diploid variants, 0/2/'NA' for haploid)
    trawfile = output + ".traw"
    with open(trawfile, 'r') as fin, open(outfile, 'w') as fout:
        # The first six header fields are the fixed columns listed above.
        samples = fin.readline().strip().split('\t')[6:]
        header = ["Variant"]
        for sam in samples:
            try:
                fid, iid = sam.split('_')
            except ValueError:
                # Raised for IDs with no underscore as well as for IDs with
                # more than one, so the message covers both cases.
                raise ValueError(
                    f"Can't determine FID and IID from sample ID: {sam}, "
                    f"expected exactly one underscore (_)."
                ) from None
            header.append(_format_sample(fid, iid))
        fout.write('\t'.join(header) + '\n')

        for line in fin:
            fields = line.strip().split('\t')
            # ref comes from the ALT column (index 5) and alt from the
            # COUNTED column (index 4), as described in the column list.
            variant = _format_variant(
                fields[0], fields[1], fields[3], fields[5], fields[4]
            )
            fout.write('\t'.join([variant] + fields[6:]) + '\n')

else:
    # .raw file is created, tab-separated, with the following columns:
    # FID             Family ID
    # IID             Individual ID
    # PAT             Paternal ID
    # MAT             Maternal ID
    # SEX             Sex (1 = male, 2 = female, 0 = unknown)
    # PHENOTYPE       Main phenotype value
    # <VariantID>...  Allelic dosage
    #                 (0/1/2/NA for diploid variants, 0/2/NA for haploid)
    #
    # Variant information may not be included in <VariantID>
    # We use the .bim file to get the variant information
    rawfile = output + ".raw"
    bimfile = input + ".bim"
    with open(rawfile, 'r') as fin, open(outfile, 'w') as fout:
        header = ["Sample"]
        with open(bimfile, 'r') as fbim:
            for line in fbim:
                fields = line.strip().split('\t')
                # Same index convention as the .traw branch:
                # ref = column index 5, alt = column index 4.
                header.append(
                    _format_variant(
                        fields[0], fields[1], fields[3], fields[5], fields[4]
                    )
                )
        fout.write('\t'.join(header) + '\n')

        next(fin)  # skip header
        for line in fin:
            fields = line.strip().split('\t')
            sam = _format_sample(fields[0], fields[1])
            fout.write('\t'.join([sam] + fields[6:]) + '\n')
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
source("{{biopipen_dir}}/utils/plot.R")
|
|
3
|
+
library(ggprism)
|
|
4
|
+
theme_set(theme_prism())
|
|
5
|
+
|
|
6
|
+
# Job inputs/outputs and options. The `{{ ... }}` placeholders are filled in
# by the template engine before this script is executed.
indir <- {{in.indir | r}}
outdir <- {{out.outdir | r}}
plink <- {{envs.plink | r}}
ncores <- {{envs.ncores | r}}
doplot <- {{envs.plot | r}}
devpars <- {{envs.devpars | r}}
samplecr <- {{envs.samplecr | r}}
varcr <- {{envs.varcr | r}}
max_iter <- {{envs.max_iter | r}}

# Locate the input fileset through its .bed file.
bedfile <- Sys.glob(file.path(indir, '*.bed'))
n_bed <- length(bedfile)
if (n_bed == 0) {
    stop("No bed files found in the input directory.")
}
if (n_bed > 1) {
    log_warn("Multiple bed files found in the input directory. Using the first one.")
    bedfile <- bedfile[1]
}

# Extension-less prefixes for the input fileset and for the outputs.
input <- tools::file_path_sans_ext(bedfile)
output <- file.path(outdir, basename(input))

# Cumulative (all-iteration) report files.
all_smiss_file <- paste0(output, '.smiss')
all_vmiss_file <- paste0(output, '.vmiss')
all_samplecr_fail_file <- paste0(output, '.samplecr.fail')
all_varcr_fail_file <- paste0(output, '.varcr.fail')

# Remove leftovers from a previous run: the cumulative tables get their
# header only when the file does not exist yet.
for (stale in c(all_smiss_file, all_vmiss_file)) {
    if (file.exists(stale)) invisible(file.remove(stale))
}
|
|
32
|
+
# Write `lines` to `path`, one entry per line; append when `append` is TRUE.
# The connection is opened with an explicit mode and always closed:
# `write()`/`cat()` ignore `append` when they are handed a connection, which
# would overwrite the cumulative files on every iteration. The file is
# created even when `lines` is empty.
write_id_lines <- function(lines, path, append = FALSE) {
    con <- file(path, open = if (append) "a" else "w")
    on.exit(close(con))
    writeLines(lines, con)
}

# Iteratively compute call rates and drop failing samples/variants until
# nothing fails or `max_iter` iterations have been run.
# NOTE(review): the output .bed/.bim/.fam links are only created when an
# iteration finds no failures; if `max_iter` is exhausted first, none are
# created. Confirm this is intended.
for (i in seq_len(max_iter)) {
    log_info("Iteration {i} ...")
    iter_dir <- file.path(outdir, paste0("iter", i))
    dir.create(iter_dir, showWarnings = FALSE)
    iter_out <- file.path(iter_dir, basename(output))

    # Missingness reports for the current fileset.
    cmd <- c(
        plink,
        "--threads", ncores,
        "--bfile", input,
        "--missing",
        "--out", iter_out
    )
    run_command(cmd, fg = TRUE)

    # ---- per-sample call rate ----
    smissfile <- paste0(iter_out, '.smiss')
    smiss <- read.table(
        smissfile,
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
    smiss$Iteration <- i
    # Append to the cumulative table; header only when the file is new.
    write.table(
        smiss,
        all_smiss_file,
        append = i > 1,
        col.names = !file.exists(all_smiss_file),
        row.names = FALSE,
        sep = "\t",
        quote = FALSE
    )
    callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
    # Row names are "FID<TAB>IID", written as-is to the --remove file below.
    rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")
    callrate.sample.fail <- rownames(callrate.sample[
        callrate.sample$Callrate < samplecr, , drop = FALSE
    ])
    write_id_lines(callrate.sample.fail, paste0(iter_out, '.samplecr.fail'))
    write_id_lines(callrate.sample.fail, all_samplecr_fail_file, append = i > 1)

    # ---- per-variant call rate ----
    vmiss <- read.table(
        paste0(iter_out, '.vmiss'),
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
    vmiss$Iteration <- i
    write.table(
        vmiss,
        all_vmiss_file,
        append = i > 1,
        col.names = !file.exists(all_vmiss_file),
        row.names = FALSE,
        sep = "\t",
        quote = FALSE
    )
    vmiss$Callrate <- 1 - vmiss$F_MISS
    callrate.var.fail <- vmiss[which(vmiss$Callrate < varcr), 'ID', drop = TRUE]
    write_id_lines(callrate.var.fail, paste0(iter_out, '.varcr.fail'))
    write_id_lines(callrate.var.fail, all_varcr_fail_file, append = i > 1)

    if (length(callrate.sample.fail) == 0 && length(callrate.var.fail) == 0) {
        # Nothing left to filter: expose the current fileset as the output
        # through symbolic links and stop iterating.
        file.symlink(paste0(input, '.bed'), paste0(output, '.bed'))
        file.symlink(paste0(input, '.bim'), paste0(output, '.bim'))
        file.symlink(paste0(input, '.fam'), paste0(output, '.fam'))
        break
    }

    # Remove samples in iter_out.samplecr.fail and variants in
    # iter_out.varcr.fail; the filtered fileset feeds the next iteration.
    cmd <- c(
        plink,
        "--threads", ncores,
        "--bfile", input,
        "--remove", paste0(iter_out, '.samplecr.fail'),
        "--exclude", paste0(iter_out, '.varcr.fail'),
        "--make-bed",
        "--out", iter_out
    )
    run_command(cmd, fg = TRUE)
    input <- iter_out
}
|
|
134
|
+
|
|
135
|
+
# Reader shared by the two missingness reports below: header row, no row
# names, column names kept verbatim, and no comment character.
read_miss_report <- function(report_path) {
    read.table(
        report_path,
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
}

# Sample call rates from the most recently written .smiss report.
smiss <- read_miss_report(smissfile)
callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")

# Variant call rates from the matching .vmiss report.
vmiss <- read_miss_report(paste0(iter_out, '.vmiss'))
vmiss$Callrate <- 1 - vmiss$F_MISS
|
|
153
|
+
|
|
154
|
+
# Histograms of sample and variant call rates, coloured by pass/fail status,
# with the threshold drawn as a dashed red line.
if (doplot) {
    log_info("Plotting ...")

    # Rows are addressed by the "FID<TAB>IID" row names of callrate.sample.
    callrate.sample$Status <- "Pass"
    callrate.sample[callrate.sample.fail, "Status"] <- "Fail"
    plotGG(
        data = callrate.sample,
        geom = "histogram",
        outfile = paste0(output, '.samplecr.png'),
        args = list(aes(fill = Status, x = Callrate), alpha = 0.8, bins = 50),
        ggs = c(
            'xlab("Sample Call Rate")',
            'ylab("Count")',
            'geom_vline(xintercept = samplecr, color = "red", linetype="dashed")',
            'theme(legend.position = "none")',
            'geom_text(aes(x = samplecr, y = Inf, label = samplecr), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
            'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
        ),
        # Pass the device parameters here too, as the variant plot does, so
        # both figures are rendered with the same settings.
        devpars = devpars
    )

    vmiss$Status <- "Pass"
    vmiss[which(vmiss$Callrate < varcr), "Status"] <- "Fail"
    plotGG(
        data = vmiss,
        geom = "histogram",
        outfile = paste0(output, '.varcr.png'),
        args = list(aes(fill = Status, x = Callrate), alpha = 0.8, bins = 50),
        ggs = c(
            'xlab("Variant Call Rate")',
            'ylab("Count")',
            'geom_vline(xintercept = varcr, color = "red", linetype="dashed")',
            'theme(legend.position = "none")',
            'geom_text(aes(x = varcr, y = Inf, label = varcr), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
            'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
        ),
        devpars = devpars
    )
}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Script for snp.PlinkFilter"""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
5
|
+
|
|
6
|
+
# Job inputs and output. The `{{ ... }}` placeholders are filled in by the
# template engine before this script is executed.
indir = {{in.indir | repr}}  # pyright: ignore # noqa: E999
samples_file = {{in.samples_file | repr}}  # pyright: ignore
variants_file = {{in.variants_file | repr}}  # pyright: ignore
outdir = {{out.outdir | repr}}  # pyright: ignore

# Options (envs). `e_samples_file` / `e_variants_file` are the envs-level
# counterparts of the input-channel files above.
plink = {{envs.plink | repr}}  # pyright: ignore
ncores = {{envs.ncores | repr}}  # pyright: ignore
samples = {{envs.samples | repr}}  # pyright: ignore
variants = {{envs.variants | repr}}  # pyright: ignore
e_samples_file = {{envs.samples_file | repr}}  # pyright: ignore
e_variants_file = {{envs.variants_file | repr}}  # pyright: ignore
keep = {{envs.keep | repr}}  # pyright: ignore
vfile_type = {{envs.vfile_type | repr}}  # pyright: ignore
chr = {{envs.chr | repr}}  # pyright: ignore
not_chr = {{envs.not_chr | repr}}  # pyright: ignore
autosome = {{envs.autosome | repr}}  # pyright: ignore
autosome_xy = {{envs.autosome_xy | repr}}  # pyright: ignore
snps_only = {{envs.snps_only | repr}}  # pyright: ignore
|
|
24
|
+
|
|
25
|
+
# The input-channel file wins; otherwise use the one configured in envs.
samples_file = samples_file or e_samples_file
if samples and not samples_file:
    # No file given: write the inline sample list to a two-column file.
    samples_file = Path(outdir) / "_samples.txt"
    if isinstance(samples, str):
        samples = [s.strip() for s in samples.split(",")]

    with open(samples_file, "w") as fh:
        sample_rows = []
        for sample in samples:
            if "/" in sample:
                # "a/b" becomes the two tab-separated columns "a" and "b".
                sample_rows.append(sample.replace("/", "\t") + "\n")
            else:
                # A bare name is used for both columns.
                sample_rows.append(sample + "\t" + sample + "\n")
        fh.writelines(sample_rows)

# Same precedence for the variants file.
variants_file = variants_file or e_variants_file
if variants and not variants_file:
    # An inline variant list is always written as a plain ID list.
    if vfile_type != "id":
        logger.warning(
            "envs.vfile_type should be 'id' if only envs.variants is provided."
        )
        vfile_type = "id"

    variants_file = Path(outdir) / "_variants.txt"
    if isinstance(variants, str):
        variants = [v.strip() for v in variants.split(",")]

    with open(variants_file, "w") as fh:
        variant_rows = [variant + "\n" for variant in variants]
        fh.writelines(variant_rows)
|
|
55
|
+
|
|
56
|
+
# Path.glob() yields matches in arbitrary order; sort them so that "the
# first one" is the same file on every run when several .bed files exist.
bedfile = sorted(Path(indir).glob("*.bed"))
if not bedfile:
    raise FileNotFoundError("No .bed file found in `in.indir`")
if len(bedfile) > 1:
    logger.warning("Multiple .bed files found in `in.indir`, using the first one.")

bedfile = bedfile[0]
# Extension-less prefix of the input fileset, and the output prefix with
# the same base name inside `outdir`.
input = bedfile.with_suffix("")
output = Path(outdir) / bedfile.stem
|
|
65
|
+
|
|
66
|
+
# Base command: read the input fileset and write a new binary fileset.
# The "" key carries the executable itself.
args = {
    "": [plink],
    "bfile": input,
    "out": output,
    "threads": ncores,
    "make-bed": True,
}

# `keep` selects whether the listed samples/variants are retained
# (keep/extract) or dropped (remove/exclude).
sample_flag, variant_flag = ("keep", "extract") if keep else ("remove", "exclude")
if samples_file:
    args[sample_flag] = samples_file
if variants_file:
    # For non-"id" files the file type is passed ahead of the file name.
    args[variant_flag] = (
        variants_file if vfile_type == "id" else [vfile_type, variants_file]
    )

if chr:
    args["chr"] = chr
if not_chr:
    args["not_chr"] = not_chr
if autosome:
    args["autosome"] = True
if autosome_xy:
    # This used to set "autosome" again, so envs.autosome_xy behaved exactly
    # like envs.autosome. Presumably dashify=True turns the key into
    # --autosome-xy, as relied on for not_chr/snps_only; confirm that the
    # configured plink build accepts that flag.
    args["autosome_xy"] = True
if snps_only:
    args["snps_only"] = snps_only

run_command(dict_to_cli_args(args, dashify=True, dup_key=False), fg=True)
|