biopipen: biopipen-0.28.1-py3-none-any.whl → biopipen-0.29.1-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (85)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +204 -0
  11. biopipen/ns/regulatory.py +214 -0
  12. biopipen/ns/scrna.py +31 -5
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +167 -3
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/delim/SampleInfo.R +10 -5
  36. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  37. biopipen/scripts/gene/GenePromoters.R +61 -0
  38. biopipen/scripts/misc/Shell.sh +15 -0
  39. biopipen/scripts/plot/Manhattan.R +146 -0
  40. biopipen/scripts/plot/QQPlot.R +146 -0
  41. biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
  42. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
  43. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
  44. biopipen/scripts/regulatory/MotifScan.py +159 -0
  45. biopipen/scripts/regulatory/atSNP.R +33 -0
  46. biopipen/scripts/regulatory/motifBreakR.R +1594 -0
  47. biopipen/scripts/scrna/MarkersFinder.R +69 -67
  48. biopipen/scripts/scrna/SeuratClustering.R +71 -29
  49. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  50. biopipen/scripts/scrna/SeuratPreparing.R +252 -122
  51. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  52. biopipen/scripts/snp/MatrixEQTL.R +85 -44
  53. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  54. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  55. biopipen/scripts/snp/PlinkFilter.py +100 -0
  56. biopipen/scripts/snp/PlinkFreq.R +298 -0
  57. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  58. biopipen/scripts/snp/PlinkHWE.R +80 -0
  59. biopipen/scripts/snp/PlinkHet.R +92 -0
  60. biopipen/scripts/snp/PlinkIBD.R +200 -0
  61. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  62. biopipen/scripts/stats/Mediation.R +94 -0
  63. biopipen/scripts/stats/MetaPvalue.R +2 -1
  64. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  65. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  66. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  67. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  68. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  69. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  70. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  71. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  72. biopipen/utils/gene.R +83 -37
  73. biopipen/utils/gene.py +108 -60
  74. biopipen/utils/misc.R +56 -0
  75. biopipen/utils/misc.py +5 -2
  76. biopipen/utils/reference.py +54 -10
  77. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
  78. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
  79. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
  80. biopipen/ns/bcftools.py +0 -111
  81. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  82. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  83. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  84. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  85. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
@@ -1,5 +1,6 @@
1
1
  source("{{biopipen_dir}}/utils/misc.R")
2
2
  library(rlang)
3
+ library(rtracklayer)
3
4
  library(MatrixEQTL)
4
5
 
5
6
  snpfile = {{in.geno | r}}
@@ -11,6 +12,7 @@ outfile = {{out.cisqtls | r}}
11
12
 
12
13
  model = {{envs.model | r}}
13
14
  pval = {{envs.pval | r}}
15
+ match_samples = {{envs.match_samples | r}}
14
16
  transp = {{envs.transp | r}}
15
17
  fdr = {{envs.fdr | r}}
16
18
  snppos = {{envs.snppos | r}}
@@ -36,7 +38,9 @@ if (!trans_enabled && !cis_enabled) {
36
38
  transp <- 1e-5
37
39
  }
38
40
 
39
- transpose_file <- function(file) {
41
+ transpose_file <- function(file, what) {
42
+ if (is.null(file)) return(NULL)
43
+ log_info("Transposing {what} file ...")
40
44
  out <- file.path(joboutdir, paste0(
41
45
  tools::file_path_sans_ext(basename(file)),
42
46
  ".transposed.",
@@ -47,10 +51,11 @@ transpose_file <- function(file) {
47
51
  out
48
52
  }
49
53
 
50
- if (transpose_geno) snpfile = transpose_file(snpfile)
51
- if (transpose_expr) expfile = transpose_file(expfile)
52
- if (transpose_cov) covfile = transpose_file(covfile)
54
+ if (transpose_geno) snpfile = transpose_file(snpfile, "geno")
55
+ if (transpose_expr) expfile = transpose_file(expfile, "expr")
56
+ if (transpose_cov) covfile = transpose_file(covfile, "cov")
53
57
 
58
+ log_info("Loading SNP data ...")
54
59
  snps = SlicedData$new();
55
60
  snps$fileDelimiter = "\t"; # the TAB character
56
61
  snps$fileOmitCharacters = "NA"; # denote missing values;
@@ -59,6 +64,7 @@ snps$fileSkipColumns = 1; # one column of row labels
59
64
  snps$fileSliceSize = 10000; # read file in pieces of 2,000 rows
60
65
  snps$LoadFile( snpfile );
61
66
 
67
+ log_info("Loading gene expression data ...")
62
68
  gene = SlicedData$new();
63
69
  gene$fileDelimiter = "\t"; # the TAB character
64
70
  gene$fileOmitCharacters = "NA"; # denote missing values;
@@ -69,16 +75,39 @@ gene$LoadFile( expfile );
69
75
 
70
76
  cvrt = SlicedData$new();
71
77
  if (!is.null(covfile) && file.exists(covfile)) {
72
- covmatrix = t(read.table.inopts(covfile, list(cnames=TRUE, rnames=TRUE)))
78
+ log_info("Loading covariate data ...")
79
+ covmatrix = read.table(covfile, header=TRUE, stringsAsFactors=FALSE, row.names=1, sep="\t", quote="", check.names=FALSE)
73
80
  cvrt$CreateFromMatrix( as.matrix(covmatrix) )
74
81
  }
75
82
 
83
+ log_info("Matching samples ...")
84
+ if (match_samples) {
85
+ # let matrixEQTL raise an error if samples do not match
86
+ } else {
87
+ n_sample_snps = snps$nCols()
88
+ n_sample_gene = gene$nCols()
89
+ common_samples = intersect(snps$columnNames, gene$columnNames)
90
+ if (!is.null(covfile)) {
91
+ common_samples = intersect(common_samples, cvrt$columnNames)
92
+ n_sample_cov = cvrt$nCols()
93
+ cvrt = cvrt$ColumnSubsample(match(common_samples, cvrt$columnNames))
94
+ }
95
+ snps = snps$ColumnSubsample(match(common_samples, snps$columnNames))
96
+ gene = gene$ColumnSubsample(match(common_samples, gene$columnNames))
97
+ log_info("- Samples used in SNP data: {n_sample_snps} -> {snps$nCols()}")
98
+ log_info("- Samples used in gene expression data: {n_sample_gene} -> {gene$nCols()}")
99
+ if (!is.null(covfile)) {
100
+ log_info("- Samples used in covariate data: {n_sample_cov} -> {cvrt$nCols()}")
101
+ }
102
+ }
103
+
104
+ log_info("Composing engine parameters ...")
76
105
  engine_params = list()
77
106
  engine_params$snps = snps
78
107
  engine_params$gene = gene
79
108
  engine_params$cvrt = cvrt
80
- engine_params$output_file_name = ifelse(trans_enabled, alleqtl, NULL)
81
- engine_params$pvOutputThreshold = ifelse(trans_enabled, transp, 0)
109
+ engine_params$output_file_name = if(trans_enabled) alleqtl else NULL
110
+ engine_params$pvOutputThreshold = if(trans_enabled) min(transp, 1) else 0
82
111
  engine_params$useModel = model
83
112
  engine_params$errorCovariance = numeric()
84
113
  engine_params$verbose = TRUE
@@ -89,66 +118,78 @@ noq = function(s) {
89
118
  }
90
119
 
91
120
  if (cis_enabled) {
121
+ log_info("Loading SNP positions ...")
92
122
  if (endsWith(snppos, ".bed")) {
93
- snppos_data = read.table.inopts(snppos,
94
- list(cnames=FALSE, rnames=FALSE))
95
- snppos_data = snppos_data[, c(4, 1, 2)]
96
- colnames(snppos_data) = c("snp", "chr", "pos")
123
+ snppos_data = read.table(snppos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
124
+ snppos_data = data.frame(
125
+ snp = snppos_data$V4,
126
+ chr = snppos_data$V1,
127
+ pos = snppos_data$V3
128
+ )
97
129
  } else if (endsWith(snppos, ".gff") || endsWith(snppos, ".gtf")) {
98
- snppos_data = read.table.inopts(snppos,
99
- list(cnames=FALSE, rnames=FALSE));
100
- snppos_data = snppos_data[, c(9, 1, 4)]
101
- colnames(snppos_data) = c("snp", "chr", "pos")
102
- snppos_data$snp = unlist(lapply(snppos_data$snp, function(x) {
103
- for (s in unlist(strsplit(x, '; ', fixed=T))) {
104
- if (startsWith(s, "snp_id "))
105
- return(noq(substring(s, 8)))
106
- else if (startsWith(s, "rs_id "))
107
- return(noq(substring(s, 7)))
108
- else if (startsWith(s, "rs "))
109
- return(noq(substring(s, 4)))
110
- }
111
- }))
130
+ snppos_data = import(snppos)
131
+ elem_meta = elementMetadata(snppos_data)
132
+ snppos_data = data.frame(
133
+ snp = elem_meta$snp_id %||% elem_meta$rs_id %||% elem_meta$rs,
134
+ chr = as.character(seqnames(snppos_data)),
135
+ pos = start(snppos_data)
136
+ )
112
137
  } else if (endsWith(snppos, ".vcf") || endsWith(snppos, ".vcf.gz")) {
113
- snppos_data = read.table.inopts(snppos,
114
- list(cnames=FALSE, rnames=FALSE))
138
+ snppos_data = read.table(
139
+ snppos,
140
+ header=FALSE,
141
+ row.names=NULL,
142
+ stringsAsFactors=FALSE,
143
+ check.names=FALSE
144
+ )
115
145
  snppos_data = snppos_data[, c(3, 1, 2)]
116
146
  colnames(snppos_data) = c("snp", "chr", "pos")
117
147
  } else {
118
- snppos_data = read.table.inopts(snppos, list(cnames=TRUE))
148
+ snppos_data = read.table(
149
+ snppos,
150
+ header=FALSE,
151
+ row.names=NULL,
152
+ stringsAsFactors=FALSE,
153
+ check.names=FALSE
154
+ )
119
155
  colnames(snppos_data) = c("snp", "chr", "pos")
120
156
  }
121
157
 
158
+ log_info("Loading gene positions ...")
122
159
  if (endsWith(genepos, ".bed")) {
123
- genepos_data = read.table.inopts(genepos,
124
- list(cnames=FALSE, rnames=FALSE))
125
- genepos_data = genepos_data[, c(4, 1:3)]
126
- colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
160
+ genepos_data = read.table(genepos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
161
+ genepos_data = data.frame(
162
+ geneid = genepos_data$V4,
163
+ chr = genepos_data$V1,
164
+ s1 = genepos_data$V2,
165
+ s2 = genepos_data$V3
166
+ )
127
167
  } else if (endsWith(genepos, ".gff") || endsWith(genepos, ".gtf")) {
128
- genepos_data = read.table.inopts(genepos,
129
- list(cnames=FALSE, rnames=FALSE))
130
- genepos_data = genepos_data[, c(9, 1, 4, 5)]
131
- colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
132
- genepos_data$geneid = noquote(unlist(lapply(genepos_data$geneid, function(x) {
133
- for (s in unlist(strsplit(x, '; ', fixed=T))) {
134
- if (startsWith(s, "gene_id "))
135
- return(noq(substring(s, 9)))
136
- }
137
- })))
168
+ genepos_data = import(genepos)
169
+ elem_meta = elementMetadata(genepos_data)
170
+ genepos_data = data.frame(
171
+ geneid = elem_meta$gene_id %||% elem_meta$gene_name,
172
+ chr = as.character(seqnames(genepos_data)),
173
+ s1 = start(genepos_data),
174
+ s2 = end(genepos_data)
175
+ )
138
176
  } else {
139
177
  genepos_data = read.table(genepos, header = TRUE, stringsAsFactors = FALSE);
140
178
  colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
141
179
  }
142
180
 
181
+ log_info("Running MatrixEQTL with cis-eQTLs enabled ...")
143
182
  engine_params$output_file_name.cis = outfile
144
- engine_params$pvOutputThreshold.cis = pval
183
+ engine_params$pvOutputThreshold.cis = min(pval, 1)
145
184
  engine_params$cisDist = dist
146
185
  engine_params$snpspos = snppos_data
147
186
  engine_params$genepos = genepos_data
148
187
  do_call(Matrix_eQTL_main, engine_params)
188
+ if (!file.exists(alleqtl)) file.create(alleqtl)
149
189
  } else {
190
+ log_info("Running MatrixEQTL without cis-eQTLs ...")
150
191
  do_call(Matrix_eQTL_engine, engine_params)
151
- file.create(outfile)
192
+ if (!file.exists(outfile)) file.create(outfile)
152
193
  }
153
194
 
154
195
  if (pval == 0) {
@@ -0,0 +1,133 @@
1
+
2
+ from os import path
3
+ from glob import glob
4
+ from biopipen.utils.misc import run_command, logger
5
+
6
+ indir = {{in.indir | repr}} # noqa: E999 # pyright: ignore
7
+ outfile = {{out.outfile | repr}} # pyright: ignore
8
+ plink = {{envs.plink | repr}} # pyright: ignore
9
+ ncores = {{envs.ncores | repr}} # pyright: ignore
10
+ transpose = {{envs.transpose | repr}} # pyright: ignore
11
+ samid = {{envs.samid | repr}} # pyright: ignore
12
+ varid = {{envs.varid | repr}} # pyright: ignore
13
+ trans_chr = {{envs.trans_chr | repr}} # pyright: ignore
14
+ missing_id = {{envs.missing_id | repr}} # pyright: ignore
15
+ trans_chr = trans_chr or {}
16
+
17
+ bedfile = glob(path.join(indir, '*.bed'))
18
+ if len(bedfile) == 0:
19
+ raise FileNotFoundError(f"No .bed file found in `in.indir`")
20
+ elif len(bedfile) > 1:
21
+ logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
22
+
23
+ bedfile = bedfile[0]
24
+ input = path.splitext(bedfile)[0]
25
+ output = path.splitext(outfile)[0]
26
+
27
+ cmd = [
28
+ plink,
29
+ "--bfile", input,
30
+ "--out", output,
31
+ "--threads", ncores,
32
+ "--keep-allele-order",
33
+ "--recode", "A-transpose" if not transpose else "A",
34
+ ]
35
+ # if transpose:
36
+ # cmd += ["tabx"]
37
+
38
+ run_command(cmd, fg=True, env={"cwd": path.dirname(outfile)})
39
+
40
+ if not transpose: # rows are variants, columns are samples
41
+ # .traw file is created, tab-separated, with the following columns:
42
+ trawfile = output + ".traw"
43
+ # CHR Chromosome code
44
+ # SNP Variant identifier
45
+ # (C)M Position in morgans or centimorgans
46
+ # POS Base-pair coordinate
47
+ # COUNTED Counted allele (defaults to A1), the actual alternative allele
48
+ # with --keep-allele-order
49
+ # ALT Other allele(s), comma-separated, the actual reference allele
50
+ # <FID>_<IID>... Allelic dosages
51
+ # (0/1/2/'NA' for diploid variants, 0/2/'NA' for haploid)
52
+ with open(trawfile, 'r') as fin:
53
+ with open(outfile, 'w') as fout:
54
+ samples = fin.readline().strip().split('\t')[6:]
55
+ header = ["Variant"]
56
+ for sam in samples:
57
+ try:
58
+ fid, iid = sam.split('_')
59
+ except ValueError:
60
+ raise ValueError(
61
+ f"Can't determine FID and IID from sample ID: {sam}, "
62
+ f"extra underscore (_) detected."
63
+ ) from None
64
+ sam = samid.replace('{fid}', fid).replace('{iid}', iid)
65
+ header.append(sam)
66
+ fout.write('\t'.join(header) + '\n')
67
+
68
+ for line in fin:
69
+ line = line.strip().split('\t')
70
+ chrom = trans_chr.get(line[0], line[0])
71
+ var = line[1]
72
+ if var == "." or var == "":
73
+ var = missing_id
74
+ pos = line[3]
75
+ ref = line[5]
76
+ alt = line[4]
77
+ variant = (
78
+ varid
79
+ .replace('{chr}', chrom)
80
+ .replace('{varid}', var)
81
+ .replace('{pos}', pos)
82
+ .replace('{ref}', ref)
83
+ .replace('{alt}', alt)
84
+ )
85
+ record = [variant] + line[6:]
86
+ fout.write('\t'.join(record) + '\n')
87
+
88
+ else:
89
+ # .raw file is created, tab-separated, with the following columns:
90
+ rawfile = output + ".raw"
91
+ # FID Family ID
92
+ # IID Individual ID
93
+ # PAT Paternal ID
94
+ # MAT Maternal ID
95
+ # SEX Sex (1 = male, 2 = female, 0 = unknown)
96
+ # PHENOTYPE Main phenotype value
97
+ # <VariantID>... Allelic dosage (0/1/2/NA for diploid variants, 0/2/NA for haploid)
98
+ #
99
+ # Variant information may not be included in <VariantID>
100
+ # We use the .bim file to get the variant information
101
+ bimfile = input + ".bim"
102
+ with open(rawfile, 'r') as fin:
103
+ with open(outfile, 'w') as fout:
104
+ header = ["Sample"]
105
+ with open(bimfile, 'r') as fbim:
106
+ for line in fbim:
107
+ line = line.strip().split('\t')
108
+ chrom = trans_chr.get(line[0], line[0])
109
+ var = line[1]
110
+ if var == "." or var == "":
111
+ var = missing_id
112
+ pos = line[3]
113
+ ref = line[5]
114
+ alt = line[4]
115
+ variant = (
116
+ varid
117
+ .replace('{chr}', chrom)
118
+ .replace('{varid}', var)
119
+ .replace('{pos}', pos)
120
+ .replace('{ref}', ref)
121
+ .replace('{alt}', alt)
122
+ )
123
+ header.append(variant)
124
+ fout.write('\t'.join(header) + '\n')
125
+
126
+ next(fin) # skip header
127
+ for line in fin:
128
+ line = line.strip().split('\t')
129
+ fid = line[0]
130
+ iid = line[1]
131
+ sam = samid.replace('{fid}', fid).replace('{iid}', iid)
132
+ record = [sam] + line[6:]
133
+ fout.write('\t'.join(record) + '\n')
@@ -0,0 +1,190 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+ source("{{biopipen_dir}}/utils/plot.R")
3
+ library(ggprism)
4
+ theme_set(theme_prism())
5
+
6
+ indir <- {{in.indir | r}}
7
+ outdir <- {{out.outdir | r}}
8
+ plink <- {{envs.plink | r}}
9
+ ncores <- {{envs.ncores | r}}
10
+ doplot <- {{envs.plot | r}}
11
+ devpars <- {{envs.devpars | r}}
12
+ samplecr <- {{envs.samplecr | r}}
13
+ varcr <- {{envs.varcr | r}}
14
+ max_iter <- {{envs.max_iter | r}}
15
+
16
+ bedfile = Sys.glob(file.path(indir, '*.bed'))
17
+ if (length(bedfile) == 0)
18
+ stop("No bed files found in the input directory.")
19
+ if (length(bedfile) > 1) {
20
+ log_warn("Multiple bed files found in the input directory. Using the first one.")
21
+ bedfile <- bedfile[1]
22
+ }
23
+ input <- tools::file_path_sans_ext(bedfile)
24
+ output <- file.path(outdir, basename(input))
25
+
26
+ all_smiss_file = paste0(output, '.smiss')
27
+ all_vmiss_file = paste0(output, '.vmiss')
28
+ all_samplecr_fail_file = paste0(output, '.samplecr.fail')
29
+ all_varcr_fail_file = paste0(output, '.varcr.fail')
30
+ if (file.exists(all_smiss_file)) invisible(file.remove(all_smiss_file))
31
+ if (file.exists(all_vmiss_file)) invisible(file.remove(all_vmiss_file))
32
+ for (i in 1:max_iter) {
33
+ log_info("Iteration {i} ...")
34
+ # iter_out <- paste0(output, "-", i)
35
+ iter_dir <- file.path(outdir, paste0("iter", i))
36
+ dir.create(iter_dir, showWarnings = FALSE)
37
+ iter_out <- file.path(iter_dir, basename(output))
38
+ cmd <- c(
39
+ plink,
40
+ "--threads", ncores,
41
+ "--bfile", input,
42
+ "--missing",
43
+ "--out", iter_out
44
+ )
45
+ run_command(cmd, fg = TRUE)
46
+
47
+ smissfile <- paste0(iter_out, '.smiss')
48
+ smiss <- read.table(
49
+ smissfile,
50
+ header = TRUE,
51
+ row.names = NULL,
52
+ check.names = FALSE,
53
+ comment.char = ""
54
+ )
55
+ smiss$Iteration <- i
56
+ # append it to all_smiss_file
57
+ write.table(
58
+ smiss,
59
+ all_smiss_file,
60
+ append = i > 1,
61
+ col.names = !file.exists(all_smiss_file),
62
+ row.names = FALSE,
63
+ sep = "\t",
64
+ quote = FALSE
65
+ )
66
+ callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
67
+ rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")
68
+ callrate.sample.fail = rownames(callrate.sample[
69
+ callrate.sample$Callrate < samplecr, , drop = FALSE
70
+ ])
71
+ writeLines(callrate.sample.fail, con = file(paste0(iter_out, '.samplecr.fail')))
72
+ # append it to all_samplecr_fail_file
73
+ write(
74
+ paste0(sapply(
75
+ callrate.sample.fail,
76
+ function(x){ paste0(x, "\n") }
77
+ ), collapse = ""),
78
+ file = file(all_samplecr_fail_file),
79
+ append = i > 1
80
+ )
81
+
82
+ vmiss <- read.table(
83
+ paste0(iter_out, '.vmiss'),
84
+ header = TRUE,
85
+ row.names = NULL,
86
+ check.names = FALSE,
87
+ comment.char = ""
88
+ )
89
+ vmiss$Iteration <- i
90
+ # append it to all_vmiss_file
91
+ write.table(
92
+ vmiss,
93
+ all_vmiss_file,
94
+ append = i > 1,
95
+ col.names = !file.exists(all_vmiss_file),
96
+ row.names = FALSE,
97
+ sep = "\t",
98
+ quote = FALSE
99
+ )
100
+ vmiss$Callrate <- 1 - vmiss$F_MISS
101
+ callrate.var.fail <- vmiss[which(vmiss$Callrate < varcr), 'ID', drop = TRUE]
102
+ writeLines(callrate.var.fail, con = file(paste0(iter_out, '.varcr.fail')))
103
+ # append it to all_varcr_fail_file
104
+ write(
105
+ paste0(sapply(
106
+ callrate.var.fail,
107
+ function(x){ paste0(x, "\n") }
108
+ ), collapse = ""),
109
+ file = file(all_varcr_fail_file),
110
+ append = i > 1
111
+ )
112
+
113
+ if (length(callrate.sample.fail) == 0 && length(callrate.var.fail) == 0) {
114
+ # make symbolic links to output from input .bed, .bim and .fam files
115
+ file.symlink(paste0(input, '.bed'), paste0(output, '.bed'))
116
+ file.symlink(paste0(input, '.bim'), paste0(output, '.bim'))
117
+ file.symlink(paste0(input, '.fam'), paste0(output, '.fam'))
118
+ break
119
+ }
120
+
121
+ # remove samples in iter_out.samplecr.fail and variants in iter_out.varcr.fail
122
+ cmd <- c(
123
+ plink,
124
+ "--threads", ncores,
125
+ "--bfile", input,
126
+ "--remove", paste0(iter_out, '.samplecr.fail'),
127
+ "--exclude", paste0(iter_out, '.varcr.fail'),
128
+ "--make-bed",
129
+ "--out", iter_out
130
+ )
131
+ run_command(cmd, fg = TRUE)
132
+ input <- iter_out
133
+ }
134
+
135
+ smiss <- read.table(
136
+ smissfile,
137
+ header = TRUE,
138
+ row.names = NULL,
139
+ check.names = FALSE,
140
+ comment.char = ""
141
+ )
142
+ callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
143
+ rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")
144
+
145
+ vmiss <- read.table(
146
+ paste0(iter_out, '.vmiss'),
147
+ header = TRUE,
148
+ row.names = NULL,
149
+ check.names = FALSE,
150
+ comment.char = ""
151
+ )
152
+ vmiss$Callrate <- 1 - vmiss$F_MISS
153
+
154
+ if (doplot) {
155
+ log_info("Plotting ...")
156
+ callrate.sample$Status <- "Pass"
157
+ callrate.sample[callrate.sample.fail, "Status"] <- "Fail"
158
+ plotGG(
159
+ data = callrate.sample,
160
+ geom = "histogram",
161
+ outfile = paste0(output, '.samplecr.png'),
162
+ args = list(aes(fill = Status, x = Callrate), alpha = 0.8, bins = 50),
163
+ ggs = c(
164
+ 'xlab("Sample Call Rate")',
165
+ 'ylab("Count")',
166
+ 'geom_vline(xintercept = samplecr, color = "red", linetype="dashed")',
167
+ 'theme(legend.position = "none")',
168
+ 'geom_text(aes(x = samplecr, y = Inf, label = samplecr), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
169
+ 'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
170
+ )
171
+ )
172
+
173
+ vmiss$Status <- "Pass"
174
+ vmiss[which(vmiss$Callrate < varcr), "Status"] <- "Fail"
175
+ plotGG(
176
+ data = vmiss,
177
+ geom = "histogram",
178
+ outfile = paste0(output, '.varcr.png'),
179
+ args = list(aes(fill = Status, x = Callrate), alpha = 0.8, bins = 50),
180
+ ggs = c(
181
+ 'xlab("Variant Call Rate")',
182
+ 'ylab("Count")',
183
+ 'geom_vline(xintercept = varcr, color = "red", linetype="dashed")',
184
+ 'theme(legend.position = "none")',
185
+ 'geom_text(aes(x = varcr, y = Inf, label = varcr), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
186
+ 'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
187
+ ),
188
+ devpars = devpars
189
+ )
190
+ }
@@ -0,0 +1,100 @@
1
+ """Script for snp.PlinkFilter"""
2
+
3
+ from pathlib import Path
4
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
5
+
6
+ indir = {{in.indir | repr}} # pyright: ignore # noqa: #999
7
+ samples_file = {{in.samples_file | repr}} # pyright: ignore
8
+ variants_file = {{in.variants_file | repr}} # pyright: ignore
9
+ outdir = {{out.outdir | repr}} # pyright: ignore
10
+
11
+ plink = {{envs.plink | repr}} # pyright: ignore
12
+ ncores = {{envs.ncores | repr}} # pyright: ignore
13
+ samples = {{envs.samples | repr}} # pyright: ignore
14
+ variants = {{envs.variants | repr}} # pyright: ignore
15
+ e_samples_file = {{envs.samples_file | repr}} # pyright: ignore
16
+ e_variants_file = {{envs.variants_file | repr}} # pyright: ignore
17
+ keep = {{envs.keep | repr}} # pyright: ignore
18
+ vfile_type = {{envs.vfile_type | repr}} # pyright: ignore
19
+ chr = {{envs.chr | repr}} # pyright: ignore
20
+ not_chr = {{envs.not_chr | repr}} # pyright: ignore
21
+ autosome = {{envs.autosome | repr}} # pyright: ignore
22
+ autosome_xy = {{envs.autosome_xy | repr}} # pyright: ignore
23
+ snps_only = {{envs.snps_only | repr}} # pyright: ignore
24
+
25
+ samples_file = samples_file or e_samples_file
26
+ if not samples_file and samples:
27
+ samples_file = Path(outdir) / "_samples.txt"
28
+ if isinstance(samples, str):
29
+ samples = [s.strip() for s in samples.split(",")]
30
+
31
+ with open(samples_file, "w") as fh:
32
+ fh.writelines(
33
+ [
34
+ line.replace("/", "\t") + "\n"
35
+ if "/" in line
36
+ else line + "\t" + line + "\n"
37
+ for line in samples
38
+ ]
39
+ )
40
+
41
+ variants_file = variants_file or e_variants_file
42
+ if not variants_file and variants:
43
+ if vfile_type != "id":
44
+ logger.warning(
45
+ "envs.vfile_type should be 'id' if only envs.variants is provided."
46
+ )
47
+ vfile_type = "id"
48
+
49
+ variants_file = Path(outdir) / "_variants.txt"
50
+ if isinstance(variants, str):
51
+ variants = [v.strip() for v in variants.split(",")]
52
+
53
+ with open(variants_file, "w") as fh:
54
+ fh.writelines([line + "\n" for line in variants])
55
+
56
+ bedfile = list(Path(indir).glob("*.bed"))
57
+ if len(bedfile) == 0:
58
+ raise FileNotFoundError(f"No .bed file found in `in.indir`")
59
+ elif len(bedfile) > 1:
60
+ logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
61
+
62
+ bedfile = bedfile[0]
63
+ input = bedfile.with_suffix("")
64
+ output = Path(outdir) / bedfile.stem
65
+
66
+ args = {
67
+ "": [plink],
68
+ "bfile": input,
69
+ "out": output,
70
+ "threads": ncores,
71
+ "make-bed": True,
72
+ }
73
+
74
+ if keep:
75
+ if samples_file:
76
+ args["keep"] = samples_file
77
+ if variants_file:
78
+ args["extract"] = (
79
+ variants_file if vfile_type == "id" else [vfile_type, variants_file]
80
+ )
81
+ else:
82
+ if samples_file:
83
+ args["remove"] = samples_file
84
+ if variants_file:
85
+ args["exclude"] = (
86
+ variants_file if vfile_type == "id" else [vfile_type, variants_file]
87
+ )
88
+
89
+ if chr:
90
+ args["chr"] = chr
91
+ if not_chr:
92
+ args["not_chr"] = not_chr
93
+ if autosome:
94
+ args["autosome"] = True
95
+ if autosome_xy:
96
+ args["autosome"] = True
97
+ if snps_only:
98
+ args["snps_only"] = snps_only
99
+
100
+ run_command(dict_to_cli_args(args, dashify=True, dup_key=False), fg=True)