biopipen 0.28.1__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (82) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +146 -0
  11. biopipen/ns/regulation.py +214 -0
  12. biopipen/ns/scrna.py +15 -3
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +74 -2
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  36. biopipen/scripts/gene/GenePromoters.R +61 -0
  37. biopipen/scripts/misc/Shell.sh +15 -0
  38. biopipen/scripts/plot/Manhattan.R +140 -0
  39. biopipen/scripts/plot/QQPlot.R +62 -0
  40. biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
  41. biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
  42. biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
  43. biopipen/scripts/regulation/MotifScan.py +159 -0
  44. biopipen/scripts/regulation/atSNP.R +33 -0
  45. biopipen/scripts/regulation/motifBreakR.R +1594 -0
  46. biopipen/scripts/scrna/MarkersFinder.R +59 -67
  47. biopipen/scripts/scrna/SeuratClustering.R +63 -29
  48. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  49. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  50. biopipen/scripts/snp/MatrixEQTL.R +84 -43
  51. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  52. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  53. biopipen/scripts/snp/PlinkFilter.py +100 -0
  54. biopipen/scripts/snp/PlinkFreq.R +298 -0
  55. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  56. biopipen/scripts/snp/PlinkHWE.R +80 -0
  57. biopipen/scripts/snp/PlinkHet.R +92 -0
  58. biopipen/scripts/snp/PlinkIBD.R +197 -0
  59. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  60. biopipen/scripts/stats/MetaPvalue.R +2 -1
  61. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  62. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  63. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  64. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  65. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  66. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  67. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  68. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  69. biopipen/utils/gene.R +83 -37
  70. biopipen/utils/gene.py +108 -60
  71. biopipen/utils/misc.R +56 -0
  72. biopipen/utils/misc.py +5 -2
  73. biopipen/utils/reference.py +54 -10
  74. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
  75. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/RECORD +77 -49
  76. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
  77. biopipen/ns/bcftools.py +0 -111
  78. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  79. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  80. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  81. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  82. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,133 @@
1
+
2
+ from os import path
3
+ from glob import glob
4
+ from biopipen.utils.misc import run_command, logger
5
+
6
+ indir = {{in.indir | repr}} # noqa: E999 # pyright: ignore
7
+ outfile = {{out.outfile | repr}} # pyright: ignore
8
+ plink = {{envs.plink | repr}} # pyright: ignore
9
+ ncores = {{envs.ncores | repr}} # pyright: ignore
10
+ transpose = {{envs.transpose | repr}} # pyright: ignore
11
+ samid = {{envs.samid | repr}} # pyright: ignore
12
+ varid = {{envs.varid | repr}} # pyright: ignore
13
+ trans_chr = {{envs.trans_chr | repr}} # pyright: ignore
14
+ missing_id = {{envs.missing_id | repr}} # pyright: ignore
15
+ trans_chr = trans_chr or {}
16
+
17
+ bedfile = glob(path.join(indir, '*.bed'))
18
+ if len(bedfile) == 0:
19
+ raise FileNotFoundError(f"No .bed file found in `in.indir`")
20
+ elif len(bedfile) > 1:
21
+ logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
22
+
23
+ bedfile = bedfile[0]
24
+ input = path.splitext(bedfile)[0]
25
+ output = path.splitext(outfile)[0]
26
+
27
+ cmd = [
28
+ plink,
29
+ "--bfile", input,
30
+ "--out", output,
31
+ "--threads", ncores,
32
+ "--keep-allele-order",
33
+ "--recode", "A-transpose" if not transpose else "A",
34
+ ]
35
+ # if transpose:
36
+ # cmd += ["tabx"]
37
+
38
+ run_command(cmd, fg=True, env={"cwd": path.dirname(outfile)})
39
+
def _variant_name(fields):
    """Build the output variant name from a split .traw or .bim record.

    Args:
        fields: Tab-split columns of one record. Columns 0, 1, 3, 4 and 5 are
            read as chromosome, variant ID, position, alt allele and ref
            allele respectively.

    Returns:
        `varid` with its `{chr}`, `{varid}`, `{pos}`, `{ref}` and `{alt}`
        placeholders substituted.
    """
    chrom = trans_chr.get(fields[0], fields[0])
    var = fields[1]
    if var in (".", ""):
        # Unnamed variants get the configured replacement ID
        var = missing_id
    return (
        varid
        .replace('{chr}', chrom)
        .replace('{varid}', var)
        .replace('{pos}', fields[3])
        .replace('{ref}', fields[5])
        .replace('{alt}', fields[4])
    )


def _sample_name(fid, iid):
    """Fill the `{fid}` and `{iid}` placeholders of `samid`."""
    return samid.replace('{fid}', fid).replace('{iid}', iid)


if not transpose:  # rows are variants, columns are samples
    # .traw file is created, tab-separated, with the following columns:
    # CHR             Chromosome code
    # SNP             Variant identifier
    # (C)M            Position in morgans or centimorgans
    # POS             Base-pair coordinate
    # COUNTED         Counted allele (defaults to A1), the actual alternative
    #                 allele with --keep-allele-order
    # ALT             Other allele(s), comma-separated, the actual reference
    #                 allele
    # <FID>_<IID>...  Allelic dosages
    #                 (0/1/2/'NA' for diploid variants, 0/2/'NA' for haploid)
    trawfile = output + ".traw"
    with open(trawfile, 'r') as fin, open(outfile, 'w') as fout:
        samples = fin.readline().strip().split('\t')[6:]
        header = ["Variant"]
        for sam in samples:
            # The column name must split into exactly FID and IID; report the
            # actual underscore count instead of always blaming an "extra" one
            # (an ID without any underscore fails here as well).
            parts = sam.split('_')
            if len(parts) != 2:
                raise ValueError(
                    f"Can't determine FID and IID from sample ID: {sam}, "
                    f"expected exactly one underscore (_) "
                    f"but found {len(parts) - 1}."
                )
            header.append(_sample_name(*parts))
        fout.write('\t'.join(header) + '\n')

        for line in fin:
            fields = line.strip().split('\t')
            record = [_variant_name(fields)] + fields[6:]
            fout.write('\t'.join(record) + '\n')

else:
    # .raw file is created, tab-separated, with the following columns:
    # FID             Family ID
    # IID             Individual ID
    # PAT             Paternal ID
    # MAT             Maternal ID
    # SEX             Sex (1 = male, 2 = female, 0 = unknown)
    # PHENOTYPE       Main phenotype value
    # <VariantID>...  Allelic dosage
    #                 (0/1/2/NA for diploid variants, 0/2/NA for haploid)
    #
    # Variant information may not be included in <VariantID>
    # We use the .bim file to get the variant information
    rawfile = output + ".raw"
    bimfile = input + ".bim"
    with open(rawfile, 'r') as fin, open(outfile, 'w') as fout:
        with open(bimfile, 'r') as fbim:
            header = ["Sample"] + [
                _variant_name(line.strip().split('\t')) for line in fbim
            ]
        fout.write('\t'.join(header) + '\n')

        next(fin)  # skip header
        for line in fin:
            fields = line.strip().split('\t')
            record = [_sample_name(fields[0], fields[1])] + fields[6:]
            fout.write('\t'.join(record) + '\n')
@@ -0,0 +1,190 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+ source("{{biopipen_dir}}/utils/plot.R")
3
+ library(ggprism)
4
+ theme_set(theme_prism())
5
+
# Job inputs, outputs and settings; the `{{ ... }}` placeholders are filled in
# by the pipeline's template engine before this script runs.
indir <- {{in.indir | r}}
outdir <- {{out.outdir | r}}
plink <- {{envs.plink | r}}
ncores <- {{envs.ncores | r}}
doplot <- {{envs.plot | r}}
devpars <- {{envs.devpars | r}}
samplecr <- {{envs.samplecr | r}}
varcr <- {{envs.varcr | r}}
max_iter <- {{envs.max_iter | r}}

# Locate the PLINK binary fileset through its .bed member
bedfile <- Sys.glob(file.path(indir, '*.bed'))
if (length(bedfile) == 0) {
    stop("No bed files found in the input directory.")
}
if (length(bedfile) > 1) {
    log_warn("Multiple bed files found in the input directory. Using the first one.")
    bedfile <- bedfile[1]
}

# `input` is the fileset prefix PLINK reads from; `output` is the prefix of
# everything written into the output directory.
input <- tools::file_path_sans_ext(bedfile)
output <- file.path(outdir, basename(input))

# Files that accumulate results over all iterations
all_smiss_file <- paste0(output, '.smiss')
all_vmiss_file <- paste0(output, '.vmiss')
all_samplecr_fail_file <- paste0(output, '.samplecr.fail')
all_varcr_fail_file <- paste0(output, '.varcr.fail')

# The loop decides whether to emit a header by checking for the file, so
# leftovers from a previous run must be removed first.
for (stale in c(all_smiss_file, all_vmiss_file)) {
    if (file.exists(stale)) invisible(file.remove(stale))
}
# Write `lines` to `path`, one element per line. With `append = TRUE` the file
# is extended, otherwise it is (re)created. A path-based connection is opened
# explicitly because `cat()`/`write()` ignore `append` when they are handed a
# connection object, and so that the connection is always closed again.
write_lines_to <- function(lines, path, append = FALSE) {
    con <- file(path, open = if (append) "a" else "w")
    on.exit(close(con), add = TRUE)
    writeLines(lines, con)
}

# Repeatedly compute sample/variant missingness and drop whatever falls below
# the call-rate thresholds, until nothing fails or `max_iter` is reached.
for (i in 1:max_iter) {
    log_info("Iteration {i} ...")
    # Every iteration writes into its own sub-directory of outdir
    iter_dir <- file.path(outdir, paste0("iter", i))
    dir.create(iter_dir, showWarnings = FALSE)
    iter_out <- file.path(iter_dir, basename(output))
    cmd <- c(
        plink,
        "--threads", ncores,
        "--bfile", input,
        "--missing",
        "--out", iter_out
    )
    run_command(cmd, fg = TRUE)

    # --- per-sample missingness -------------------------------------------
    smissfile <- paste0(iter_out, '.smiss')
    smiss <- read.table(
        smissfile,
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
    smiss$Iteration <- i
    # Append to the cumulative report; the header is written only once
    write.table(
        smiss,
        all_smiss_file,
        append = i > 1,
        col.names = !file.exists(all_smiss_file),
        row.names = FALSE,
        sep = "\t",
        quote = FALSE
    )
    callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
    # Row names double as the "FID<TAB>IID" lines handed to --remove
    rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")
    callrate.sample.fail <- rownames(callrate.sample[
        callrate.sample$Callrate < samplecr, , drop = FALSE
    ])
    write_lines_to(callrate.sample.fail, paste0(iter_out, '.samplecr.fail'))
    # Accumulate the failures of all iterations (first iteration truncates)
    write_lines_to(callrate.sample.fail, all_samplecr_fail_file, append = i > 1)

    # --- per-variant missingness ------------------------------------------
    vmiss <- read.table(
        paste0(iter_out, '.vmiss'),
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
    vmiss$Iteration <- i
    write.table(
        vmiss,
        all_vmiss_file,
        append = i > 1,
        col.names = !file.exists(all_vmiss_file),
        row.names = FALSE,
        sep = "\t",
        quote = FALSE
    )
    vmiss$Callrate <- 1 - vmiss$F_MISS
    callrate.var.fail <- vmiss[which(vmiss$Callrate < varcr), 'ID', drop = TRUE]
    write_lines_to(callrate.var.fail, paste0(iter_out, '.varcr.fail'))
    write_lines_to(callrate.var.fail, all_varcr_fail_file, append = i > 1)

    if (length(callrate.sample.fail) == 0 && length(callrate.var.fail) == 0) {
        # Converged: expose the current fileset under the output prefix via
        # symbolic links to its .bed, .bim and .fam files
        file.symlink(paste0(input, '.bed'), paste0(output, '.bed'))
        file.symlink(paste0(input, '.bim'), paste0(output, '.bim'))
        file.symlink(paste0(input, '.fam'), paste0(output, '.fam'))
        break
    }
    # NOTE(review): if max_iter is exhausted without reaching the branch
    # above, no <output>.bed/.bim/.fam is created -- confirm this is intended.

    # Remove the failing samples and variants; the filtered fileset becomes
    # the input of the next iteration
    cmd <- c(
        plink,
        "--threads", ncores,
        "--bfile", input,
        "--remove", paste0(iter_out, '.samplecr.fail'),
        "--exclude", paste0(iter_out, '.varcr.fail'),
        "--make-bed",
        "--out", iter_out
    )
    run_command(cmd, fg = TRUE)
    input <- iter_out
}
134
+
# Read one PLINK missingness report as a data frame, keeping the column names
# exactly as they appear in the file.
read_miss_report <- function(path) {
    read.table(
        path,
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
}

# Reload the reports of the last iteration that ran (`smissfile` and
# `iter_out` still hold the values set inside the loop).
smiss <- read_miss_report(smissfile)
callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")

vmiss <- read_miss_report(paste0(iter_out, '.vmiss'))
vmiss$Callrate <- 1 - vmiss$F_MISS

if (doplot) {
    log_info("Plotting ...")
    # `callrate.sample.fail` holds the failing samples of the last iteration
    callrate.sample$Status <- "Pass"
    callrate.sample[callrate.sample.fail, "Status"] <- "Fail"
    plotGG(
        data = callrate.sample,
        geom = "histogram",
        outfile = paste0(output, '.samplecr.png'),
        args = list(aes(fill = Status, x = Callrate), alpha = 0.8, bins = 50),
        ggs = c(
            'xlab("Sample Call Rate")',
            'ylab("Count")',
            'geom_vline(xintercept = samplecr, color = "red", linetype="dashed")',
            'theme(legend.position = "none")',
            'geom_text(aes(x = samplecr, y = Inf, label = samplecr), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
            'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
        ),
        # Honour the configured device parameters here as well, matching the
        # variant call-rate plot below
        devpars = devpars
    )

    vmiss$Status <- "Pass"
    vmiss[which(vmiss$Callrate < varcr), "Status"] <- "Fail"
    plotGG(
        data = vmiss,
        geom = "histogram",
        outfile = paste0(output, '.varcr.png'),
        args = list(aes(fill = Status, x = Callrate), alpha = 0.8, bins = 50),
        ggs = c(
            'xlab("Variant Call Rate")',
            'ylab("Count")',
            'geom_vline(xintercept = varcr, color = "red", linetype="dashed")',
            'theme(legend.position = "none")',
            'geom_text(aes(x = varcr, y = Inf, label = varcr), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
            'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
        ),
        devpars = devpars
    )
}
@@ -0,0 +1,100 @@
1
+ """Script for snp.PlinkFilter"""
2
+
3
+ from pathlib import Path
4
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
5
+
6
+ indir = {{in.indir | repr}} # pyright: ignore # noqa: #999
7
+ samples_file = {{in.samples_file | repr}} # pyright: ignore
8
+ variants_file = {{in.variants_file | repr}} # pyright: ignore
9
+ outdir = {{out.outdir | repr}} # pyright: ignore
10
+
11
+ plink = {{envs.plink | repr}} # pyright: ignore
12
+ ncores = {{envs.ncores | repr}} # pyright: ignore
13
+ samples = {{envs.samples | repr}} # pyright: ignore
14
+ variants = {{envs.variants | repr}} # pyright: ignore
15
+ e_samples_file = {{envs.samples_file | repr}} # pyright: ignore
16
+ e_variants_file = {{envs.variants_file | repr}} # pyright: ignore
17
+ keep = {{envs.keep | repr}} # pyright: ignore
18
+ vfile_type = {{envs.vfile_type | repr}} # pyright: ignore
19
+ chr = {{envs.chr | repr}} # pyright: ignore
20
+ not_chr = {{envs.not_chr | repr}} # pyright: ignore
21
+ autosome = {{envs.autosome | repr}} # pyright: ignore
22
+ autosome_xy = {{envs.autosome_xy | repr}} # pyright: ignore
23
+ snps_only = {{envs.snps_only | repr}} # pyright: ignore
24
+
25
+ samples_file = samples_file or e_samples_file
26
+ if not samples_file and samples:
27
+ samples_file = Path(outdir) / "_samples.txt"
28
+ if isinstance(samples, str):
29
+ samples = [s.strip() for s in samples.split(",")]
30
+
31
+ with open(samples_file, "w") as fh:
32
+ fh.writelines(
33
+ [
34
+ line.replace("/", "\t") + "\n"
35
+ if "/" in line
36
+ else line + "\t" + line + "\n"
37
+ for line in samples
38
+ ]
39
+ )
40
+
41
+ variants_file = variants_file or e_variants_file
42
+ if not variants_file and variants:
43
+ if vfile_type != "id":
44
+ logger.warning(
45
+ "envs.vfile_type should be 'id' if only envs.variants is provided."
46
+ )
47
+ vfile_type = "id"
48
+
49
+ variants_file = Path(outdir) / "_variants.txt"
50
+ if isinstance(variants, str):
51
+ variants = [v.strip() for v in variants.split(",")]
52
+
53
+ with open(variants_file, "w") as fh:
54
+ fh.writelines([line + "\n" for line in variants])
55
+
# Locate the PLINK binary fileset through its .bed member.
bedfiles = list(Path(indir).glob("*.bed"))
if not bedfiles:
    raise FileNotFoundError("No .bed file found in `in.indir`")
if len(bedfiles) > 1:
    logger.warning("Multiple .bed files found in `in.indir`, using the first one.")

bedfile = bedfiles[0]
# Extension-less prefixes: where PLINK reads from and where it writes to
bfile_prefix = bedfile.with_suffix("")
out_prefix = Path(outdir) / bedfile.stem

# Keys are turned into command-line options by `dict_to_cli_args`; the value
# under the "" key carries the executable itself.
args = {
    "": [plink],
    "bfile": bfile_prefix,
    "out": out_prefix,
    "threads": ncores,
    "make-bed": True,
}

# `keep` decides whether the listed samples/variants are retained or dropped.
sample_opt, variant_opt = ("keep", "extract") if keep else ("remove", "exclude")
if samples_file:
    args[sample_opt] = samples_file
if variants_file:
    # Any file type other than "id" is passed along in front of the file name
    args[variant_opt] = (
        variants_file if vfile_type == "id" else [vfile_type, variants_file]
    )

if chr:
    args["chr"] = chr
if not_chr:
    args["not_chr"] = not_chr
if autosome:
    args["autosome"] = True
if autosome_xy:
    # Previously this set args["autosome"], so envs.autosome_xy was silently
    # treated as plain envs.autosome.
    # NOTE(review): confirm the flag spelling accepted by the configured
    # PLINK build.
    args["autosome_xy"] = True
if snps_only:
    args["snps_only"] = snps_only

run_command(dict_to_cli_args(args, dashify=True, dup_key=False), fg=True)