biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +8 -0
- biopipen/ns/bam.py +0 -2
- biopipen/ns/bed.py +35 -0
- biopipen/ns/cellranger_pipeline.py +5 -5
- biopipen/ns/cnv.py +18 -2
- biopipen/ns/cnvkit_pipeline.py +16 -11
- biopipen/ns/gene.py +68 -23
- biopipen/ns/misc.py +2 -15
- biopipen/ns/plot.py +204 -0
- biopipen/ns/regulatory.py +214 -0
- biopipen/ns/scrna.py +31 -5
- biopipen/ns/snp.py +516 -8
- biopipen/ns/stats.py +167 -3
- biopipen/ns/vcf.py +196 -0
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/scripts/bam/CNVpytor.py +144 -46
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMerge.py +1 -1
- biopipen/scripts/cnv/AneuploidyScore.R +30 -7
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
- biopipen/scripts/cnv/TMADScore.R +21 -5
- biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
- biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
- biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
- biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
- biopipen/scripts/delim/SampleInfo.R +10 -5
- biopipen/scripts/gene/GeneNameConversion.R +65 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/plot/Manhattan.R +146 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/atSNP.R +33 -0
- biopipen/scripts/regulatory/motifBreakR.R +1594 -0
- biopipen/scripts/scrna/MarkersFinder.R +69 -67
- biopipen/scripts/scrna/SeuratClustering.R +71 -29
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
- biopipen/scripts/scrna/SeuratPreparing.R +252 -122
- biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
- biopipen/scripts/snp/MatrixEQTL.R +85 -44
- biopipen/scripts/snp/Plink2GTMat.py +133 -0
- biopipen/scripts/snp/PlinkCallRate.R +190 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +298 -0
- biopipen/scripts/snp/PlinkFromVcf.py +78 -0
- biopipen/scripts/snp/PlinkHWE.R +80 -0
- biopipen/scripts/snp/PlinkHet.R +92 -0
- biopipen/scripts/snp/PlinkIBD.R +200 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/Mediation.R +94 -0
- biopipen/scripts/stats/MetaPvalue.R +2 -1
- biopipen/scripts/stats/MetaPvalue1.R +70 -0
- biopipen/scripts/tcr/TCRClusterStats.R +12 -7
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/VcfFix_utils.py +1 -1
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/utils/gene.R +83 -37
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.R +56 -0
- biopipen/utils/misc.py +5 -2
- biopipen/utils/reference.py +54 -10
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
- biopipen/ns/bcftools.py +0 -111
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
2
|
library(rlang)
|
|
3
|
+
library(rtracklayer)
|
|
3
4
|
library(MatrixEQTL)
|
|
4
5
|
|
|
5
6
|
snpfile = {{in.geno | r}}
|
|
@@ -11,6 +12,7 @@ outfile = {{out.cisqtls | r}}
|
|
|
11
12
|
|
|
12
13
|
model = {{envs.model | r}}
|
|
13
14
|
pval = {{envs.pval | r}}
|
|
15
|
+
match_samples = {{envs.match_samples | r}}
|
|
14
16
|
transp = {{envs.transp | r}}
|
|
15
17
|
fdr = {{envs.fdr | r}}
|
|
16
18
|
snppos = {{envs.snppos | r}}
|
|
@@ -36,7 +38,9 @@ if (!trans_enabled && !cis_enabled) {
|
|
|
36
38
|
transp <- 1e-5
|
|
37
39
|
}
|
|
38
40
|
|
|
39
|
-
transpose_file <- function(file) {
|
|
41
|
+
transpose_file <- function(file, what) {
|
|
42
|
+
if (is.null(file)) return(NULL)
|
|
43
|
+
log_info("Transposing {what} file ...")
|
|
40
44
|
out <- file.path(joboutdir, paste0(
|
|
41
45
|
tools::file_path_sans_ext(basename(file)),
|
|
42
46
|
".transposed.",
|
|
@@ -47,10 +51,11 @@ transpose_file <- function(file) {
|
|
|
47
51
|
out
|
|
48
52
|
}
|
|
49
53
|
|
|
50
|
-
if (transpose_geno) snpfile = transpose_file(snpfile)
|
|
51
|
-
if (transpose_expr) expfile = transpose_file(expfile)
|
|
52
|
-
if (transpose_cov) covfile = transpose_file(covfile)
|
|
54
|
+
if (transpose_geno) snpfile = transpose_file(snpfile, "geno")
|
|
55
|
+
if (transpose_expr) expfile = transpose_file(expfile, "expr")
|
|
56
|
+
if (transpose_cov) covfile = transpose_file(covfile, "cov")
|
|
53
57
|
|
|
58
|
+
log_info("Loading SNP data ...")
|
|
54
59
|
snps = SlicedData$new();
|
|
55
60
|
snps$fileDelimiter = "\t"; # the TAB character
|
|
56
61
|
snps$fileOmitCharacters = "NA"; # denote missing values;
|
|
@@ -59,6 +64,7 @@ snps$fileSkipColumns = 1; # one column of row labels
|
|
|
59
64
|
snps$fileSliceSize = 10000; # read file in pieces of 2,000 rows
|
|
60
65
|
snps$LoadFile( snpfile );
|
|
61
66
|
|
|
67
|
+
log_info("Loading gene expression data ...")
|
|
62
68
|
gene = SlicedData$new();
|
|
63
69
|
gene$fileDelimiter = "\t"; # the TAB character
|
|
64
70
|
gene$fileOmitCharacters = "NA"; # denote missing values;
|
|
@@ -69,16 +75,39 @@ gene$LoadFile( expfile );
|
|
|
69
75
|
|
|
70
76
|
cvrt = SlicedData$new();
|
|
71
77
|
if (!is.null(covfile) && file.exists(covfile)) {
|
|
72
|
-
|
|
78
|
+
log_info("Loading covariate data ...")
|
|
79
|
+
covmatrix = read.table(covfile, header=TRUE, stringsAsFactors=FALSE, row.names=1, sep="\t", quote="", check.names=FALSE)
|
|
73
80
|
cvrt$CreateFromMatrix( as.matrix(covmatrix) )
|
|
74
81
|
}
|
|
75
82
|
|
|
83
|
+
log_info("Matching samples ...")
|
|
84
|
+
if (match_samples) {
|
|
85
|
+
# let matrixEQTL raise an error if samples do not match
|
|
86
|
+
} else {
|
|
87
|
+
n_sample_snps = snps$nCols()
|
|
88
|
+
n_sample_gene = gene$nCols()
|
|
89
|
+
common_samples = intersect(snps$columnNames, gene$columnNames)
|
|
90
|
+
if (!is.null(covfile)) {
|
|
91
|
+
common_samples = intersect(common_samples, cvrt$columnNames)
|
|
92
|
+
n_sample_cov = cvrt$nCols()
|
|
93
|
+
cvrt = cvrt$ColumnSubsample(match(common_samples, cvrt$columnNames))
|
|
94
|
+
}
|
|
95
|
+
snps = snps$ColumnSubsample(match(common_samples, snps$columnNames))
|
|
96
|
+
gene = gene$ColumnSubsample(match(common_samples, gene$columnNames))
|
|
97
|
+
log_info("- Samples used in SNP data: {n_sample_snps} -> {snps$nCols()}")
|
|
98
|
+
log_info("- Samples used in gene expression data: {n_sample_gene} -> {gene$nCols()}")
|
|
99
|
+
if (!is.null(covfile)) {
|
|
100
|
+
log_info("- Samples used in covariate data: {n_sample_cov} -> {cvrt$nCols()}")
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
log_info("Composing engine parameters ...")
|
|
76
105
|
engine_params = list()
|
|
77
106
|
engine_params$snps = snps
|
|
78
107
|
engine_params$gene = gene
|
|
79
108
|
engine_params$cvrt = cvrt
|
|
80
|
-
engine_params$output_file_name =
|
|
81
|
-
engine_params$pvOutputThreshold =
|
|
109
|
+
engine_params$output_file_name = if(trans_enabled) alleqtl else NULL
|
|
110
|
+
engine_params$pvOutputThreshold = if(trans_enabled) min(transp, 1) else 0
|
|
82
111
|
engine_params$useModel = model
|
|
83
112
|
engine_params$errorCovariance = numeric()
|
|
84
113
|
engine_params$verbose = TRUE
|
|
@@ -89,66 +118,78 @@ noq = function(s) {
|
|
|
89
118
|
}
|
|
90
119
|
|
|
91
120
|
if (cis_enabled) {
|
|
121
|
+
log_info("Loading SNP positions ...")
|
|
92
122
|
if (endsWith(snppos, ".bed")) {
|
|
93
|
-
snppos_data = read.table
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
123
|
+
snppos_data = read.table(snppos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
|
|
124
|
+
snppos_data = data.frame(
|
|
125
|
+
snp = snppos_data$V4,
|
|
126
|
+
chr = snppos_data$V1,
|
|
127
|
+
pos = snppos_data$V3
|
|
128
|
+
)
|
|
97
129
|
} else if (endsWith(snppos, ".gff") || endsWith(snppos, ".gtf")) {
|
|
98
|
-
snppos_data =
|
|
99
|
-
|
|
100
|
-
snppos_data =
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
return(noq(substring(s, 8)))
|
|
106
|
-
else if (startsWith(s, "rs_id "))
|
|
107
|
-
return(noq(substring(s, 7)))
|
|
108
|
-
else if (startsWith(s, "rs "))
|
|
109
|
-
return(noq(substring(s, 4)))
|
|
110
|
-
}
|
|
111
|
-
}))
|
|
130
|
+
snppos_data = import(snppos)
|
|
131
|
+
elem_meta = elementMetadata(snppos_data)
|
|
132
|
+
snppos_data = data.frame(
|
|
133
|
+
snp = elem_meta$snp_id %||% elem_meta$rs_id %||% elem_meta$rs,
|
|
134
|
+
chr = as.character(seqnames(snppos_data)),
|
|
135
|
+
pos = start(snppos_data)
|
|
136
|
+
)
|
|
112
137
|
} else if (endsWith(snppos, ".vcf") || endsWith(snppos, ".vcf.gz")) {
|
|
113
|
-
snppos_data = read.table
|
|
114
|
-
|
|
138
|
+
snppos_data = read.table(
|
|
139
|
+
snppos,
|
|
140
|
+
header=FALSE,
|
|
141
|
+
row.names=NULL,
|
|
142
|
+
stringsAsFactors=FALSE,
|
|
143
|
+
check.names=FALSE
|
|
144
|
+
)
|
|
115
145
|
snppos_data = snppos_data[, c(3, 1, 2)]
|
|
116
146
|
colnames(snppos_data) = c("snp", "chr", "pos")
|
|
117
147
|
} else {
|
|
118
|
-
snppos_data = read.table
|
|
148
|
+
snppos_data = read.table(
|
|
149
|
+
snppos,
|
|
150
|
+
header=FALSE,
|
|
151
|
+
row.names=NULL,
|
|
152
|
+
stringsAsFactors=FALSE,
|
|
153
|
+
check.names=FALSE
|
|
154
|
+
)
|
|
119
155
|
colnames(snppos_data) = c("snp", "chr", "pos")
|
|
120
156
|
}
|
|
121
157
|
|
|
158
|
+
log_info("Loading gene positions ...")
|
|
122
159
|
if (endsWith(genepos, ".bed")) {
|
|
123
|
-
genepos_data = read.table
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
160
|
+
genepos_data = read.table(genepos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
|
|
161
|
+
genepos_data = data.frame(
|
|
162
|
+
geneid = genepos_data$V4,
|
|
163
|
+
chr = genepos_data$V1,
|
|
164
|
+
s1 = genepos_data$V2,
|
|
165
|
+
s2 = genepos_data$V3
|
|
166
|
+
)
|
|
127
167
|
} else if (endsWith(genepos, ".gff") || endsWith(genepos, ".gtf")) {
|
|
128
|
-
genepos_data =
|
|
129
|
-
|
|
130
|
-
genepos_data =
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
}
|
|
137
|
-
})))
|
|
168
|
+
genepos_data = import(genepos)
|
|
169
|
+
elem_meta = elementMetadata(genepos_data)
|
|
170
|
+
genepos_data = data.frame(
|
|
171
|
+
geneid = elem_meta$gene_id %||% elem_meta$gene_name,
|
|
172
|
+
chr = as.character(seqnames(genepos_data)),
|
|
173
|
+
s1 = start(genepos_data),
|
|
174
|
+
s2 = end(genepos_data)
|
|
175
|
+
)
|
|
138
176
|
} else {
|
|
139
177
|
genepos_data = read.table(genepos, header = TRUE, stringsAsFactors = FALSE);
|
|
140
178
|
colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
|
|
141
179
|
}
|
|
142
180
|
|
|
181
|
+
log_info("Running MatrixEQTL with cis-eQTLs enabled ...")
|
|
143
182
|
engine_params$output_file_name.cis = outfile
|
|
144
|
-
engine_params$pvOutputThreshold.cis = pval
|
|
183
|
+
engine_params$pvOutputThreshold.cis = min(pval, 1)
|
|
145
184
|
engine_params$cisDist = dist
|
|
146
185
|
engine_params$snpspos = snppos_data
|
|
147
186
|
engine_params$genepos = genepos_data
|
|
148
187
|
do_call(Matrix_eQTL_main, engine_params)
|
|
188
|
+
if (!file.exists(alleqtl)) file.create(alleqtl)
|
|
149
189
|
} else {
|
|
190
|
+
log_info("Running MatrixEQTL without cis-eQTLs ...")
|
|
150
191
|
do_call(Matrix_eQTL_engine, engine_params)
|
|
151
|
-
file.create(outfile)
|
|
192
|
+
if (!file.exists(outfile)) file.create(outfile)
|
|
152
193
|
}
|
|
153
194
|
|
|
154
195
|
if (pval == 0) {
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
|
|
2
|
+
from os import path
|
|
3
|
+
from glob import glob
|
|
4
|
+
from biopipen.utils.misc import run_command, logger
|
|
5
|
+
|
|
6
|
+
indir = {{in.indir | repr}} # noqa: E999 # pyright: ignore
|
|
7
|
+
outfile = {{out.outfile | repr}} # pyright: ignore
|
|
8
|
+
plink = {{envs.plink | repr}} # pyright: ignore
|
|
9
|
+
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
10
|
+
transpose = {{envs.transpose | repr}} # pyright: ignore
|
|
11
|
+
samid = {{envs.samid | repr}} # pyright: ignore
|
|
12
|
+
varid = {{envs.varid | repr}} # pyright: ignore
|
|
13
|
+
trans_chr = {{envs.trans_chr | repr}} # pyright: ignore
|
|
14
|
+
missing_id = {{envs.missing_id | repr}} # pyright: ignore
|
|
15
|
+
trans_chr = trans_chr or {}
|
|
16
|
+
|
|
17
|
+
bedfile = glob(path.join(indir, '*.bed'))
|
|
18
|
+
if len(bedfile) == 0:
|
|
19
|
+
raise FileNotFoundError(f"No .bed file found in `in.indir`")
|
|
20
|
+
elif len(bedfile) > 1:
|
|
21
|
+
logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
|
|
22
|
+
|
|
23
|
+
bedfile = bedfile[0]
|
|
24
|
+
input = path.splitext(bedfile)[0]
|
|
25
|
+
output = path.splitext(outfile)[0]
|
|
26
|
+
|
|
27
|
+
cmd = [
|
|
28
|
+
plink,
|
|
29
|
+
"--bfile", input,
|
|
30
|
+
"--out", output,
|
|
31
|
+
"--threads", ncores,
|
|
32
|
+
"--keep-allele-order",
|
|
33
|
+
"--recode", "A-transpose" if not transpose else "A",
|
|
34
|
+
]
|
|
35
|
+
# if transpose:
|
|
36
|
+
# cmd += ["tabx"]
|
|
37
|
+
|
|
38
|
+
run_command(cmd, fg=True, env={"cwd": path.dirname(outfile)})
|
|
39
|
+
|
|
40
|
+
if not transpose: # rows are variants, columns are samples
|
|
41
|
+
# .traw file is created, tab-separated, with the following columns:
|
|
42
|
+
trawfile = output + ".traw"
|
|
43
|
+
# CHR Chromosome code
|
|
44
|
+
# SNP Variant identifier
|
|
45
|
+
# (C)M Position in morgans or centimorgans
|
|
46
|
+
# POS Base-pair coordinate
|
|
47
|
+
# COUNTED Counted allele (defaults to A1), the actual alternative allele
|
|
48
|
+
# with --keep-allele-order
|
|
49
|
+
# ALT Other allele(s), comma-separated, the actual reference allele
|
|
50
|
+
# <FID>_<IID>... Allelic dosages
|
|
51
|
+
# (0/1/2/'NA' for diploid variants, 0/2/'NA' for haploid)
|
|
52
|
+
with open(trawfile, 'r') as fin:
|
|
53
|
+
with open(outfile, 'w') as fout:
|
|
54
|
+
samples = fin.readline().strip().split('\t')[6:]
|
|
55
|
+
header = ["Variant"]
|
|
56
|
+
for sam in samples:
|
|
57
|
+
try:
|
|
58
|
+
fid, iid = sam.split('_')
|
|
59
|
+
except ValueError:
|
|
60
|
+
raise ValueError(
|
|
61
|
+
f"Can't determine FID and IID from sample ID: {sam}, "
|
|
62
|
+
f"extra underscore (_) detected."
|
|
63
|
+
) from None
|
|
64
|
+
sam = samid.replace('{fid}', fid).replace('{iid}', iid)
|
|
65
|
+
header.append(sam)
|
|
66
|
+
fout.write('\t'.join(header) + '\n')
|
|
67
|
+
|
|
68
|
+
for line in fin:
|
|
69
|
+
line = line.strip().split('\t')
|
|
70
|
+
chrom = trans_chr.get(line[0], line[0])
|
|
71
|
+
var = line[1]
|
|
72
|
+
if var == "." or var == "":
|
|
73
|
+
var = missing_id
|
|
74
|
+
pos = line[3]
|
|
75
|
+
ref = line[5]
|
|
76
|
+
alt = line[4]
|
|
77
|
+
variant = (
|
|
78
|
+
varid
|
|
79
|
+
.replace('{chr}', chrom)
|
|
80
|
+
.replace('{varid}', var)
|
|
81
|
+
.replace('{pos}', pos)
|
|
82
|
+
.replace('{ref}', ref)
|
|
83
|
+
.replace('{alt}', alt)
|
|
84
|
+
)
|
|
85
|
+
record = [variant] + line[6:]
|
|
86
|
+
fout.write('\t'.join(record) + '\n')
|
|
87
|
+
|
|
88
|
+
else:
|
|
89
|
+
# .raw file is created, tab-separated, with the following columns:
|
|
90
|
+
rawfile = output + ".raw"
|
|
91
|
+
# FID Family ID
|
|
92
|
+
# IID Individual ID
|
|
93
|
+
# PAT Paternal ID
|
|
94
|
+
# MAT Maternal ID
|
|
95
|
+
# SEX Sex (1 = male, 2 = female, 0 = unknown)
|
|
96
|
+
# PHENOTYPE Main phenotype value
|
|
97
|
+
# <VariantID>... Allelic dosage (0/1/2/NA for diploid variants, 0/2/NA for haploid)
|
|
98
|
+
#
|
|
99
|
+
# Variant information may not be included in <VariantID>
|
|
100
|
+
# We use the .bim file to get the variant information
|
|
101
|
+
bimfile = input + ".bim"
|
|
102
|
+
with open(rawfile, 'r') as fin:
|
|
103
|
+
with open(outfile, 'w') as fout:
|
|
104
|
+
header = ["Sample"]
|
|
105
|
+
with open(bimfile, 'r') as fbim:
|
|
106
|
+
for line in fbim:
|
|
107
|
+
line = line.strip().split('\t')
|
|
108
|
+
chrom = trans_chr.get(line[0], line[0])
|
|
109
|
+
var = line[1]
|
|
110
|
+
if var == "." or var == "":
|
|
111
|
+
var = missing_id
|
|
112
|
+
pos = line[3]
|
|
113
|
+
ref = line[5]
|
|
114
|
+
alt = line[4]
|
|
115
|
+
variant = (
|
|
116
|
+
varid
|
|
117
|
+
.replace('{chr}', chrom)
|
|
118
|
+
.replace('{varid}', var)
|
|
119
|
+
.replace('{pos}', pos)
|
|
120
|
+
.replace('{ref}', ref)
|
|
121
|
+
.replace('{alt}', alt)
|
|
122
|
+
)
|
|
123
|
+
header.append(variant)
|
|
124
|
+
fout.write('\t'.join(header) + '\n')
|
|
125
|
+
|
|
126
|
+
next(fin) # skip header
|
|
127
|
+
for line in fin:
|
|
128
|
+
line = line.strip().split('\t')
|
|
129
|
+
fid = line[0]
|
|
130
|
+
iid = line[1]
|
|
131
|
+
sam = samid.replace('{fid}', fid).replace('{iid}', iid)
|
|
132
|
+
record = [sam] + line[6:]
|
|
133
|
+
fout.write('\t'.join(record) + '\n')
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
source("{{biopipen_dir}}/utils/plot.R")
|
|
3
|
+
library(ggprism)
|
|
4
|
+
theme_set(theme_prism())
|
|
5
|
+
|
|
6
|
+
# Iteratively filter a PLINK binary fileset by sample and variant call rate.
#
# Each iteration runs `plink --missing` on the current fileset, records the
# per-sample (.smiss) and per-variant (.vmiss) tables, and removes samples /
# variants whose call rate (1 - F_MISS) is below `samplecr` / `varcr`.
# The loop stops when nothing fails or after `max_iter` iterations.
# The templated values below are filled in by the pipeline before execution.
indir <- {{in.indir | r}}
outdir <- {{out.outdir | r}}
plink <- {{envs.plink | r}}
ncores <- {{envs.ncores | r}}
doplot <- {{envs.plot | r}}
devpars <- {{envs.devpars | r}}
samplecr <- {{envs.samplecr | r}}
varcr <- {{envs.varcr | r}}
max_iter <- {{envs.max_iter | r}}

# Append `lines` to `path`, creating the file when needed.
# The connection is opened explicitly in append mode and always closed:
# write()/cat() only honour `append` when given a file name, so handing them
# an unopened file() connection truncates the file on every call.
append_lines <- function(lines, path) {
    con <- file(path, open = "a")
    on.exit(close(con), add = TRUE)
    writeLines(as.character(lines), con)
}

# Symlink the .bed/.bim/.fam files of fileset prefix `from` to prefix `to`.
link_fileset <- function(from, to) {
    for (ext in c('.bed', '.bim', '.fam')) {
        invisible(file.symlink(paste0(from, ext), paste0(to, ext)))
    }
}

bedfile = Sys.glob(file.path(indir, '*.bed'))
if (length(bedfile) == 0)
    stop("No bed files found in the input directory.")
if (length(bedfile) > 1) {
    log_warn("Multiple bed files found in the input directory. Using the first one.")
    bedfile <- bedfile[1]
}
# plink takes file prefixes (paths without extension)
input <- tools::file_path_sans_ext(bedfile)
output <- file.path(outdir, basename(input))

# Cumulative (all-iteration) outputs
all_smiss_file = paste0(output, '.smiss')
all_vmiss_file = paste0(output, '.vmiss')
all_samplecr_fail_file = paste0(output, '.samplecr.fail')
all_varcr_fail_file = paste0(output, '.varcr.fail')
# Start from a clean slate so that appending below never mixes in results
# from a previous run.
for (stale in c(all_smiss_file, all_vmiss_file,
                all_samplecr_fail_file, all_varcr_fail_file)) {
    if (file.exists(stale)) invisible(file.remove(stale))
}

converged <- FALSE
for (i in seq_len(max_iter)) {
    log_info("Iteration {i} ...")
    iter_dir <- file.path(outdir, paste0("iter", i))
    dir.create(iter_dir, showWarnings = FALSE)
    iter_out <- file.path(iter_dir, basename(output))
    cmd <- c(
        plink,
        "--threads", ncores,
        "--bfile", input,
        "--missing",
        "--out", iter_out
    )
    run_command(cmd, fg = TRUE)

    # ---- per-sample missingness -------------------------------------------
    smissfile <- paste0(iter_out, '.smiss')
    smiss <- read.table(
        smissfile,
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
    smiss$Iteration <- i
    # append it to all_smiss_file (header only when the file is new)
    write.table(
        smiss,
        all_smiss_file,
        append = i > 1,
        col.names = !file.exists(all_smiss_file),
        row.names = FALSE,
        sep = "\t",
        quote = FALSE
    )
    callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
    # NOTE(review): with comment.char = "" and check.names = FALSE the first
    # header column keeps any leading '#' (e.g. '#FID'); confirm that
    # smiss$FID / smiss$IID resolve to the intended columns.
    rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")
    callrate.sample.fail = rownames(callrate.sample[
        callrate.sample$Callrate < samplecr, , drop = FALSE
    ])
    # Passing the path lets writeLines open and close the file itself.
    writeLines(callrate.sample.fail, paste0(iter_out, '.samplecr.fail'))
    # append it to all_samplecr_fail_file
    append_lines(callrate.sample.fail, all_samplecr_fail_file)

    # ---- per-variant missingness ------------------------------------------
    vmiss <- read.table(
        paste0(iter_out, '.vmiss'),
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
    vmiss$Iteration <- i
    # append it to all_vmiss_file (header only when the file is new)
    write.table(
        vmiss,
        all_vmiss_file,
        append = i > 1,
        col.names = !file.exists(all_vmiss_file),
        row.names = FALSE,
        sep = "\t",
        quote = FALSE
    )
    vmiss$Callrate <- 1 - vmiss$F_MISS
    callrate.var.fail <- as.character(
        vmiss[which(vmiss$Callrate < varcr), 'ID', drop = TRUE]
    )
    writeLines(callrate.var.fail, paste0(iter_out, '.varcr.fail'))
    # append it to all_varcr_fail_file
    append_lines(callrate.var.fail, all_varcr_fail_file)

    if (length(callrate.sample.fail) == 0 && length(callrate.var.fail) == 0) {
        # Nothing left to remove:
        # make symbolic links to output from input .bed, .bim and .fam files
        link_fileset(input, output)
        converged <- TRUE
        break
    }

    # remove samples in iter_out.samplecr.fail and variants in iter_out.varcr.fail
    cmd <- c(
        plink,
        "--threads", ncores,
        "--bfile", input,
        "--remove", paste0(iter_out, '.samplecr.fail'),
        "--exclude", paste0(iter_out, '.varcr.fail'),
        "--make-bed",
        "--out", iter_out
    )
    run_command(cmd, fg = TRUE)
    # the filtered fileset is the input of the next iteration
    input <- iter_out
}

if (!converged) {
    # Without this the output directory would contain no .bed/.bim/.fam at
    # all when the iteration limit is hit.
    log_warn("Call rates did not converge within envs.max_iter iterations. Using the fileset from the last filtering step.")
    link_fileset(input, output)
}

# Reload the tables of the last iteration for plotting
smiss <- read.table(
    smissfile,
    header = TRUE,
    row.names = NULL,
    check.names = FALSE,
    comment.char = ""
)
callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")

vmiss <- read.table(
    paste0(iter_out, '.vmiss'),
    header = TRUE,
    row.names = NULL,
    check.names = FALSE,
    comment.char = ""
)
vmiss$Callrate <- 1 - vmiss$F_MISS

if (doplot) {
    log_info("Plotting ...")
    callrate.sample$Status <- "Pass"
    callrate.sample[callrate.sample.fail, "Status"] <- "Fail"
    plotGG(
        data = callrate.sample,
        geom = "histogram",
        outfile = paste0(output, '.samplecr.png'),
        args = list(aes(fill = Status, x = Callrate), alpha = 0.8, bins = 50),
        ggs = c(
            'xlab("Sample Call Rate")',
            'ylab("Count")',
            'geom_vline(xintercept = samplecr, color = "red", linetype="dashed")',
            'theme(legend.position = "none")',
            'geom_text(aes(x = samplecr, y = Inf, label = samplecr), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
            'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
        ),
        # same device settings as the variant plot below
        devpars = devpars
    )

    vmiss$Status <- "Pass"
    vmiss[which(vmiss$Callrate < varcr), "Status"] <- "Fail"
    plotGG(
        data = vmiss,
        geom = "histogram",
        outfile = paste0(output, '.varcr.png'),
        args = list(aes(fill = Status, x = Callrate), alpha = 0.8, bins = 50),
        ggs = c(
            'xlab("Variant Call Rate")',
            'ylab("Count")',
            'geom_vline(xintercept = varcr, color = "red", linetype="dashed")',
            'theme(legend.position = "none")',
            'geom_text(aes(x = varcr, y = Inf, label = varcr), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
            'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
        ),
        devpars = devpars
    )
}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Script for snp.PlinkFilter"""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
5
|
+
|
|
6
|
+
indir = {{in.indir | repr}} # pyright: ignore # noqa: #999
|
|
7
|
+
samples_file = {{in.samples_file | repr}} # pyright: ignore
|
|
8
|
+
variants_file = {{in.variants_file | repr}} # pyright: ignore
|
|
9
|
+
outdir = {{out.outdir | repr}} # pyright: ignore
|
|
10
|
+
|
|
11
|
+
plink = {{envs.plink | repr}} # pyright: ignore
|
|
12
|
+
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
13
|
+
samples = {{envs.samples | repr}} # pyright: ignore
|
|
14
|
+
variants = {{envs.variants | repr}} # pyright: ignore
|
|
15
|
+
e_samples_file = {{envs.samples_file | repr}} # pyright: ignore
|
|
16
|
+
e_variants_file = {{envs.variants_file | repr}} # pyright: ignore
|
|
17
|
+
keep = {{envs.keep | repr}} # pyright: ignore
|
|
18
|
+
vfile_type = {{envs.vfile_type | repr}} # pyright: ignore
|
|
19
|
+
chr = {{envs.chr | repr}} # pyright: ignore
|
|
20
|
+
not_chr = {{envs.not_chr | repr}} # pyright: ignore
|
|
21
|
+
autosome = {{envs.autosome | repr}} # pyright: ignore
|
|
22
|
+
autosome_xy = {{envs.autosome_xy | repr}} # pyright: ignore
|
|
23
|
+
snps_only = {{envs.snps_only | repr}} # pyright: ignore
|
|
24
|
+
|
|
25
|
+
samples_file = samples_file or e_samples_file
|
|
26
|
+
if not samples_file and samples:
|
|
27
|
+
samples_file = Path(outdir) / "_samples.txt"
|
|
28
|
+
if isinstance(samples, str):
|
|
29
|
+
samples = [s.strip() for s in samples.split(",")]
|
|
30
|
+
|
|
31
|
+
with open(samples_file, "w") as fh:
|
|
32
|
+
fh.writelines(
|
|
33
|
+
[
|
|
34
|
+
line.replace("/", "\t") + "\n"
|
|
35
|
+
if "/" in line
|
|
36
|
+
else line + "\t" + line + "\n"
|
|
37
|
+
for line in samples
|
|
38
|
+
]
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
variants_file = variants_file or e_variants_file
|
|
42
|
+
if not variants_file and variants:
|
|
43
|
+
if vfile_type != "id":
|
|
44
|
+
logger.warning(
|
|
45
|
+
"envs.vfile_type should be 'id' if only envs.variants is provided."
|
|
46
|
+
)
|
|
47
|
+
vfile_type = "id"
|
|
48
|
+
|
|
49
|
+
variants_file = Path(outdir) / "_variants.txt"
|
|
50
|
+
if isinstance(variants, str):
|
|
51
|
+
variants = [v.strip() for v in variants.split(",")]
|
|
52
|
+
|
|
53
|
+
with open(variants_file, "w") as fh:
|
|
54
|
+
fh.writelines([line + "\n" for line in variants])
|
|
55
|
+
|
|
56
|
+
bedfile = list(Path(indir).glob("*.bed"))
|
|
57
|
+
if len(bedfile) == 0:
|
|
58
|
+
raise FileNotFoundError(f"No .bed file found in `in.indir`")
|
|
59
|
+
elif len(bedfile) > 1:
|
|
60
|
+
logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
|
|
61
|
+
|
|
62
|
+
bedfile = bedfile[0]
|
|
63
|
+
input = bedfile.with_suffix("")
|
|
64
|
+
output = Path(outdir) / bedfile.stem
|
|
65
|
+
|
|
66
|
+
args = {
|
|
67
|
+
"": [plink],
|
|
68
|
+
"bfile": input,
|
|
69
|
+
"out": output,
|
|
70
|
+
"threads": ncores,
|
|
71
|
+
"make-bed": True,
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
if keep:
|
|
75
|
+
if samples_file:
|
|
76
|
+
args["keep"] = samples_file
|
|
77
|
+
if variants_file:
|
|
78
|
+
args["extract"] = (
|
|
79
|
+
variants_file if vfile_type == "id" else [vfile_type, variants_file]
|
|
80
|
+
)
|
|
81
|
+
else:
|
|
82
|
+
if samples_file:
|
|
83
|
+
args["remove"] = samples_file
|
|
84
|
+
if variants_file:
|
|
85
|
+
args["exclude"] = (
|
|
86
|
+
variants_file if vfile_type == "id" else [vfile_type, variants_file]
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
if chr:
|
|
90
|
+
args["chr"] = chr
|
|
91
|
+
if not_chr:
|
|
92
|
+
args["not_chr"] = not_chr
|
|
93
|
+
if autosome:
|
|
94
|
+
args["autosome"] = True
|
|
95
|
+
if autosome_xy:
|
|
96
|
+
args["autosome"] = True
|
|
97
|
+
if snps_only:
|
|
98
|
+
args["snps_only"] = snps_only
|
|
99
|
+
|
|
100
|
+
run_command(dict_to_cli_args(args, dashify=True, dup_key=False), fg=True)
|