biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +28 -0
- biopipen/core/filters.py +79 -4
- biopipen/core/proc.py +12 -3
- biopipen/core/testing.py +75 -3
- biopipen/ns/bam.py +148 -6
- biopipen/ns/bed.py +75 -0
- biopipen/ns/cellranger.py +186 -0
- biopipen/ns/cellranger_pipeline.py +126 -0
- biopipen/ns/cnv.py +19 -3
- biopipen/ns/cnvkit.py +1 -1
- biopipen/ns/cnvkit_pipeline.py +20 -12
- biopipen/ns/delim.py +34 -35
- biopipen/ns/gene.py +68 -23
- biopipen/ns/gsea.py +63 -37
- biopipen/ns/misc.py +39 -14
- biopipen/ns/plot.py +304 -1
- biopipen/ns/protein.py +183 -0
- biopipen/ns/regulatory.py +290 -0
- biopipen/ns/rnaseq.py +142 -5
- biopipen/ns/scrna.py +2053 -473
- biopipen/ns/scrna_metabolic_landscape.py +228 -382
- biopipen/ns/snp.py +659 -0
- biopipen/ns/stats.py +484 -0
- biopipen/ns/tcr.py +683 -98
- biopipen/ns/vcf.py +236 -2
- biopipen/ns/web.py +97 -6
- biopipen/reports/bam/CNVpytor.svelte +4 -9
- biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
- biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
- biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
- biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
- biopipen/reports/common.svelte +15 -0
- biopipen/reports/protein/ProdigySummary.svelte +16 -0
- biopipen/reports/scrna/CellsDistribution.svelte +4 -39
- biopipen/reports/scrna/DimPlots.svelte +1 -1
- biopipen/reports/scrna/MarkersFinder.svelte +6 -126
- biopipen/reports/scrna/MetaMarkers.svelte +3 -75
- biopipen/reports/scrna/RadarPlots.svelte +4 -20
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
- biopipen/reports/tcr/ClonalStats.svelte +16 -0
- biopipen/reports/tcr/CloneResidency.svelte +3 -93
- biopipen/reports/tcr/Immunarch.svelte +4 -155
- biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
- biopipen/reports/tcr/TESSA.svelte +11 -28
- biopipen/reports/utils/misc.liq +22 -7
- biopipen/scripts/bam/BamMerge.py +11 -15
- biopipen/scripts/bam/BamSampling.py +90 -0
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +38 -0
- biopipen/scripts/bam/CNAClinic.R +41 -5
- biopipen/scripts/bam/CNVpytor.py +153 -54
- biopipen/scripts/bam/ControlFREEC.py +13 -14
- biopipen/scripts/bam/SamtoolsView.py +33 -0
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +138 -0
- biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
- biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
- biopipen/scripts/cnv/AneuploidyScore.R +55 -20
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
- biopipen/scripts/cnv/TMADScore.R +25 -9
- biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
- biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
- biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/RowsBinder.R +1 -1
- biopipen/scripts/delim/SampleInfo.R +116 -118
- biopipen/scripts/gene/GeneNameConversion.R +67 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/gsea/Enrichr.R +5 -5
- biopipen/scripts/gsea/FGSEA.R +184 -50
- biopipen/scripts/gsea/GSEA.R +2 -2
- biopipen/scripts/gsea/PreRank.R +5 -5
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Plot.R +80 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/plot/Heatmap.R +3 -3
- biopipen/scripts/plot/Manhattan.R +147 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/plot/ROC.R +88 -0
- biopipen/scripts/plot/Scatter.R +112 -0
- biopipen/scripts/plot/VennDiagram.R +5 -9
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +119 -0
- biopipen/scripts/protein/ProdigySummary.R +140 -0
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
- biopipen/scripts/regulatory/motifs-common.R +324 -0
- biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
- biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
- biopipen/scripts/rnaseq/Simulation.R +21 -0
- biopipen/scripts/rnaseq/UnitConversion.R +325 -54
- biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
- biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
- biopipen/scripts/scrna/CellCellCommunication.py +150 -0
- biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
- biopipen/scripts/scrna/CellSNPLite.py +30 -0
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
- biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
- biopipen/scripts/scrna/CellsDistribution.R +456 -167
- biopipen/scripts/scrna/DimPlots.R +1 -1
- biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
- biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
- biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
- biopipen/scripts/scrna/ExprImputation.R +7 -0
- biopipen/scripts/scrna/LoomTo10X.R +51 -0
- biopipen/scripts/scrna/MQuad.py +25 -0
- biopipen/scripts/scrna/MarkersFinder.R +679 -400
- biopipen/scripts/scrna/MetaMarkers.R +265 -161
- biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
- biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
- biopipen/scripts/scrna/RadarPlots.R +355 -134
- biopipen/scripts/scrna/ScFGSEA.R +298 -100
- biopipen/scripts/scrna/ScSimulation.R +65 -0
- biopipen/scripts/scrna/ScVelo.py +617 -0
- biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
- biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
- biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
- biopipen/scripts/scrna/SeuratClustering.R +36 -233
- biopipen/scripts/scrna/SeuratLoading.R +2 -2
- biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
- biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
- biopipen/scripts/scrna/SeuratPreparing.R +223 -173
- biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
- biopipen/scripts/scrna/SeuratTo10X.R +27 -0
- biopipen/scripts/scrna/Slingshot.R +65 -0
- biopipen/scripts/scrna/Subset10X.R +2 -2
- biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
- biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
- biopipen/scripts/scrna/scvelo_paga.py +313 -0
- biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
- biopipen/scripts/snp/MatrixEQTL.R +217 -0
- biopipen/scripts/snp/Plink2GTMat.py +148 -0
- biopipen/scripts/snp/PlinkCallRate.R +199 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +291 -0
- biopipen/scripts/snp/PlinkFromVcf.py +81 -0
- biopipen/scripts/snp/PlinkHWE.R +85 -0
- biopipen/scripts/snp/PlinkHet.R +96 -0
- biopipen/scripts/snp/PlinkIBD.R +196 -0
- biopipen/scripts/snp/PlinkSimulation.py +124 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/ChowTest.R +146 -0
- biopipen/scripts/stats/DiffCoexpr.R +152 -0
- biopipen/scripts/stats/LiquidAssoc.R +135 -0
- biopipen/scripts/stats/Mediation.R +108 -0
- biopipen/scripts/stats/MetaPvalue.R +130 -0
- biopipen/scripts/stats/MetaPvalue1.R +74 -0
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/Attach2Seurat.R +3 -2
- biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
- biopipen/scripts/tcr/CDR3Clustering.R +343 -0
- biopipen/scripts/tcr/ClonalStats.R +526 -0
- biopipen/scripts/tcr/CloneResidency.R +255 -131
- biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
- biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
- biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
- biopipen/scripts/tcr/GIANA/query.py +164 -162
- biopipen/scripts/tcr/Immunarch-basic.R +31 -9
- biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
- biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
- biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
- biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
- biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
- biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
- biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
- biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
- biopipen/scripts/tcr/Immunarch.R +63 -11
- biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
- biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
- biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
- biopipen/scripts/tcr/SampleDiversity.R +1 -1
- biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
- biopipen/scripts/tcr/ScRepLoading.R +166 -0
- biopipen/scripts/tcr/TCRClusterStats.R +176 -22
- biopipen/scripts/tcr/TCRDock.py +110 -0
- biopipen/scripts/tcr/TESSA.R +102 -118
- biopipen/scripts/tcr/VJUsage.R +5 -5
- biopipen/scripts/tcr/immunarch-patched.R +142 -0
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
- biopipen/scripts/vcf/TruvariConsistency.R +1 -1
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +13 -4
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
- biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
- biopipen/scripts/web/gcloud_common.py +49 -0
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.py +146 -20
- biopipen/utils/reference.py +64 -20
- biopipen/utils/reporter.py +177 -0
- biopipen/utils/vcf.py +1 -1
- biopipen-0.34.26.dist-info/METADATA +27 -0
- biopipen-0.34.26.dist-info/RECORD +292 -0
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
- biopipen/ns/bcftools.py +0 -111
- biopipen/ns/scrna_basic.py +0 -255
- biopipen/reports/delim/SampleInfo.svelte +0 -36
- biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
- biopipen/reports/scrna/ScFGSEA.svelte +0 -35
- biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
- biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
- biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
- biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
- biopipen/reports/utils/gsea.liq +0 -110
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
- biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
- biopipen/scripts/scrna/ExprImpution.R +0 -7
- biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
- biopipen/scripts/scrna/Write10X.R +0 -11
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
- biopipen/scripts/tcr/TCRClustering.R +0 -280
- biopipen/utils/common_docstrs.py +0 -61
- biopipen/utils/gene.R +0 -49
- biopipen/utils/gsea.R +0 -193
- biopipen/utils/io.R +0 -20
- biopipen/utils/misc.R +0 -114
- biopipen/utils/mutate_helpers.R +0 -433
- biopipen/utils/plot.R +0 -173
- biopipen/utils/rnaseq.R +0 -48
- biopipen/utils/single_cell.R +0 -115
- biopipen-0.21.0.dist-info/METADATA +0 -22
- biopipen-0.21.0.dist-info/RECORD +0 -218
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
|
|
2
|
+
from os import path
|
|
3
|
+
from glob import glob
|
|
4
|
+
from biopipen.utils.misc import run_command, logger
|
|
5
|
+
|
|
6
|
+
indir: str = {{in.indir | quote}} # noqa: E999 # pyright: ignore
|
|
7
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
8
|
+
plink: str = {{envs.plink | quote}} # pyright: ignore
|
|
9
|
+
ncores: int = {{envs.ncores | repr}} # pyright: ignore
|
|
10
|
+
transpose: bool = {{envs.transpose | repr}} # pyright: ignore
|
|
11
|
+
samid: str = {{envs.samid | repr}} # pyright: ignore
|
|
12
|
+
varid: str = {{envs.varid | repr}} # pyright: ignore
|
|
13
|
+
trans_chr: dict = {{envs.trans_chr | repr}} # pyright: ignore
|
|
14
|
+
missing_id: str = {{envs.missing_id | repr}} # pyright: ignore
|
|
15
|
+
gtcoding: str = {{envs.gtcoding | repr}} # pyright: ignore
|
|
16
|
+
trans_chr = trans_chr or {}
|
|
17
|
+
|
|
18
|
+
bedfile = glob(path.join(indir, '*.bed'))
|
|
19
|
+
if len(bedfile) == 0:
|
|
20
|
+
raise FileNotFoundError(f"No .bed file found in `in.indir`")
|
|
21
|
+
elif len(bedfile) > 1:
|
|
22
|
+
logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
|
|
23
|
+
|
|
24
|
+
bedfile = bedfile[0]
|
|
25
|
+
input = path.splitext(bedfile)[0]
|
|
26
|
+
output = path.splitext(outfile)[0]
|
|
27
|
+
|
|
28
|
+
cmd = [
|
|
29
|
+
plink,
|
|
30
|
+
"--bfile", input,
|
|
31
|
+
"--out", output,
|
|
32
|
+
"--threads", ncores,
|
|
33
|
+
"--keep-allele-order",
|
|
34
|
+
"--recode", "A-transpose" if not transpose else "A",
|
|
35
|
+
]
|
|
36
|
+
# if transpose:
|
|
37
|
+
# cmd += ["tabx"]
|
|
38
|
+
|
|
39
|
+
run_command(cmd, fg=True, env={"cwd": path.dirname(outfile)})
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _vcf_gtcoding(gt):
|
|
43
|
+
try:
|
|
44
|
+
return str(2 - int(gt))
|
|
45
|
+
except (ValueError, TypeError):
|
|
46
|
+
return "NA"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _split_fields(line):
    """Split one record of a PLINK text file into its fields.

    Records containing a tab are split on tabs only (the previous behavior).
    Records without any tab are split on runs of whitespace so that
    space-delimited files are parsed too. A blank line yields an empty list.
    """
    line = line.strip()
    return line.split('\t') if '\t' in line else line.split()


def _format_variant(fields):
    """Render the output variant name from the leading fields of a record.

    The .traw and .bim records read below share the layout used here:
    index 0 chromosome, 1 variant ID, 3 position, 4 and 5 the two alleles.
    Index 4 is used as `{alt}` and index 5 as `{ref}`.
    """
    chrom = trans_chr.get(fields[0], fields[0])
    var = fields[1]
    if var == "." or var == "":
        var = missing_id
    return (
        varid
        .replace('{chr}', chrom)
        .replace('{varid}', var)
        .replace('{pos}', fields[3])
        .replace('{ref}', fields[5])
        .replace('{alt}', fields[4])
    )


def _format_sample(fid, iid):
    """Render the output sample name from the `samid` template."""
    return samid.replace('{fid}', fid).replace('{iid}', iid)


def _encode_genotypes(values):
    """Return the genotype columns as-is ("plink") or flipped via _vcf_gtcoding."""
    if gtcoding == "plink":
        return list(values)
    return [_vcf_gtcoding(x) for x in values]  # vcf


if not transpose:  # rows are variants, columns are samples
    # A .traw file is created, with the following columns:
    #   CHR             Chromosome code
    #   SNP             Variant identifier
    #   (C)M            Position in morgans or centimorgans
    #   POS             Base-pair coordinate
    #   COUNTED         Counted allele (defaults to A1), the actual alternative
    #                   allele with --keep-allele-order
    #   ALT             Other allele(s), comma-separated, the actual reference
    #                   allele
    #   <FID>_<IID>...  Allelic dosages
    #                   (0/1/2/'NA' for diploid variants, 0/2/'NA' for haploid)
    trawfile = output + ".traw"
    with open(trawfile, 'r') as fin, open(outfile, 'w') as fout:
        header = ["Variant"]
        for sam in _split_fields(fin.readline())[6:]:
            parts = sam.split('_')
            # Exactly one underscore is required to tell FID and IID apart;
            # both a missing and an extra underscore are ambiguous.
            if len(parts) != 2:
                raise ValueError(
                    f"Can't determine FID and IID from sample ID: {sam}, "
                    "expected exactly one underscore (_) between FID and IID."
                )
            header.append(_format_sample(*parts))
        fout.write('\t'.join(header) + '\n')

        for line in fin:
            fields = _split_fields(line)
            if not fields:  # tolerate blank lines
                continue
            record = [_format_variant(fields)] + _encode_genotypes(fields[6:])
            fout.write('\t'.join(record) + '\n')

else:
    # A .raw file is created, with the following columns:
    #   FID             Family ID
    #   IID             Individual ID
    #   PAT             Paternal ID
    #   MAT             Maternal ID
    #   SEX             Sex (1 = male, 2 = female, 0 = unknown)
    #   PHENOTYPE       Main phenotype value
    #   <VariantID>...  Allelic dosage
    #                   (0/1/2/NA for diploid variants, 0/2/NA for haploid)
    #
    # Variant information may not be included in <VariantID>,
    # so the .bim file is used to get the variant information.
    rawfile = output + ".raw"
    bimfile = input + ".bim"
    with open(rawfile, 'r') as fin, open(outfile, 'w') as fout:
        header = ["Sample"]
        with open(bimfile, 'r') as fbim:
            for line in fbim:
                fields = _split_fields(line)
                if fields:  # tolerate blank lines
                    header.append(_format_variant(fields))
        fout.write('\t'.join(header) + '\n')

        next(fin, None)  # skip header
        for line in fin:
            fields = _split_fields(line)
            if not fields:  # tolerate blank lines
                continue
            record = [_format_sample(fields[0], fields[1])]
            record += _encode_genotypes(fields[6:])
            fout.write('\t'.join(record) + '\n')
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
library(plotthis)
|
|
2
|
+
library(biopipen.utils)
|
|
3
|
+
|
|
4
|
+
# Job parameters; the `{{ ... }}` placeholders are substituted by the
# pipeline's template engine before this script runs.
indir <- {{in.indir | r}}
outdir <- {{out.outdir | r}}
plink <- {{envs.plink | r}}
ncores <- {{envs.ncores | r}}
doplot <- {{envs.plot | r}}
devpars <- {{envs.devpars | r}}
samplecr <- {{envs.samplecr | r}}
varcr <- {{envs.varcr | r}}
max_iter <- {{envs.max_iter | r}}

log <- get_logger()

# Locate the input dataset: exactly one .bed file is expected in `indir`
bedfile <- Sys.glob(file.path(indir, '*.bed'))
n_bedfiles <- length(bedfile)
if (n_bedfiles == 0) {
    stop("No bed files found in the input directory.")
}
if (n_bedfiles > 1) {
    log$warn("Multiple bed files found in the input directory. Using the first one.")
    bedfile <- bedfile[1]
}

# PLINK works with file prefixes (paths without extension)
input <- tools::file_path_sans_ext(bedfile)
output <- file.path(outdir, basename(input))

# Cumulative reports collected across all iterations
all_smiss_file <- paste0(output, '.smiss')
all_vmiss_file <- paste0(output, '.vmiss')
all_samplecr_fail_file <- paste0(output, '.samplecr.fail')
all_varcr_fail_file <- paste0(output, '.varcr.fail')

# Start from a clean slate: the missingness tables are appended to below
if (file.exists(all_smiss_file)) {
    invisible(file.remove(all_smiss_file))
}
if (file.exists(all_vmiss_file)) {
    invisible(file.remove(all_vmiss_file))
}
|
|
32
|
+
# Write `lines` to `path`, one entry per line; append when `append` is TRUE.
# The connection is opened explicitly with the right mode and always closed:
# write()/cat() only honour `append` for file names, not for connection
# objects, so handing them `file(path)` rewrote the file on every call.
write_id_list <- function(lines, path, append = FALSE) {
    con <- file(path, open = if (append) "a" else "w")
    on.exit(close(con), add = TRUE)
    writeLines(lines, con = con)
}

# Read a PLINK missingness report, keeping the header names untouched
read_missing_report <- function(path) {
    read.table(
        path,
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
}

# Repeatedly compute call rates and drop failing samples/variants until
# nothing fails or `max_iter` iterations have been run.
for (i in 1:max_iter) {
    log$info("Iteration {i} ...")
    iter_dir <- file.path(outdir, paste0("iter", i))
    dir.create(iter_dir, showWarnings = FALSE)
    iter_out <- file.path(iter_dir, basename(output))

    # Missingness of the current dataset (.smiss per sample, .vmiss per variant)
    cmd <- c(
        plink,
        "--threads", ncores,
        "--bfile", input,
        "--missing",
        "--out", iter_out
    )
    run_command(cmd, fg = TRUE)

    # ---- samples -----------------------------------------------------------
    smissfile <- paste0(iter_out, '.smiss')
    smiss <- read_missing_report(smissfile)
    smiss$Iteration <- i
    # append it to all_smiss_file (header only when the file is first created)
    write.table(
        smiss,
        all_smiss_file,
        append = i > 1,
        col.names = !file.exists(all_smiss_file),
        row.names = FALSE,
        sep = "\t",
        quote = FALSE
    )
    callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
    # NOTE(review): with check.names = FALSE a header column written as `#FID`
    # keeps its `#`, in which case `smiss$FID` would be NULL - confirm against
    # the actual .smiss header.
    rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")
    callrate.sample.fail <- rownames(callrate.sample[
        callrate.sample$Callrate < samplecr, , drop = FALSE
    ])
    samplecr_fail_file <- paste0(iter_out, '.samplecr.fail')
    write_id_list(callrate.sample.fail, samplecr_fail_file)
    # append it to all_samplecr_fail_file
    write_id_list(callrate.sample.fail, all_samplecr_fail_file, append = i > 1)

    # ---- variants ----------------------------------------------------------
    vmiss <- read_missing_report(paste0(iter_out, '.vmiss'))
    vmiss$Iteration <- i
    # append it to all_vmiss_file (header only when the file is first created)
    write.table(
        vmiss,
        all_vmiss_file,
        append = i > 1,
        col.names = !file.exists(all_vmiss_file),
        row.names = FALSE,
        sep = "\t",
        quote = FALSE
    )
    vmiss$Callrate <- 1 - vmiss$F_MISS
    callrate.var.fail <- vmiss[which(vmiss$Callrate < varcr), 'ID', drop = TRUE]
    varcr_fail_file <- paste0(iter_out, '.varcr.fail')
    write_id_list(callrate.var.fail, varcr_fail_file)
    # append it to all_varcr_fail_file
    write_id_list(callrate.var.fail, all_varcr_fail_file, append = i > 1)

    if (length(callrate.sample.fail) == 0 && length(callrate.var.fail) == 0) {
        # Converged: expose the current dataset as the final output by making
        # symbolic links from its .bed, .bim and .fam files.
        # NOTE(review): these links are only created on convergence; if
        # max_iter is reached first, nothing is written at `output` - confirm
        # that this is intended.
        file.symlink(paste0(input, '.bed'), paste0(output, '.bed'))
        file.symlink(paste0(input, '.bim'), paste0(output, '.bim'))
        file.symlink(paste0(input, '.fam'), paste0(output, '.fam'))
        break
    }

    # remove samples in iter_out.samplecr.fail and variants in iter_out.varcr.fail
    cmd <- c(
        plink,
        "--threads", ncores,
        "--bfile", input,
        "--remove", samplecr_fail_file,
        "--exclude", varcr_fail_file,
        "--make-bed",
        "--out", iter_out
    )
    run_command(cmd, fg = TRUE)
    # The filtered dataset is the input of the next iteration
    input <- iter_out
}
|
|
134
|
+
|
|
135
|
+
# Re-read the missingness reports of the last iteration that was run
load_report <- function(report_file) {
    read.table(
        report_file,
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
}

smiss <- load_report(smissfile)
callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")

vmiss <- load_report(paste0(iter_out, '.vmiss'))
vmiss$Callrate <- 1 - vmiss$F_MISS

if (doplot) {
    log$info("Plotting ...")

    # Draw a call-rate histogram split by Pass/Fail and save it as a PNG whose
    # size is taken from the plot's "width"/"height" attributes.
    save_callrate_histogram <- function(data, xlab, pngfile) {
        p <- Histogram(
            data,
            x = "Callrate",
            group_by = "Status",
            xlab = xlab,
            ylab = "Count",
            palette = "Set1",
            alpha = 0.8,
            bins = 50
        )
        res <- 70
        png(
            pngfile,
            width = attr(p, "width") * res,
            height = attr(p, "height") * res,
            res = res
        )
        print(p)
        dev.off()
    }
    status_levels <- c("Fail", "Pass")

    # Samples: flag the ones that failed in the last iteration
    callrate.sample$Status <- "Pass"
    callrate.sample[callrate.sample.fail, "Status"] <- "Fail"
    callrate.sample$Status <- factor(callrate.sample$Status, levels = status_levels)
    p_callrate_file <- paste0(output, '.samplecr.png')
    save_callrate_histogram(callrate.sample, "Sample Call Rate", p_callrate_file)

    # Variants: flag the ones below the variant call-rate cutoff
    vmiss$Status <- "Pass"
    vmiss[which(vmiss$Callrate < varcr), "Status"] <- "Fail"
    vmiss$Status <- factor(vmiss$Status, levels = status_levels)
    p_varcr_file <- paste0(output, '.varcr.png')
    save_callrate_histogram(vmiss, "Variant Call Rate", p_varcr_file)
}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
5
|
+
|
|
6
|
+
indir: str = {{in.indir | quote}} # pyright: ignore # noqa: #999
|
|
7
|
+
samples_file = {{in.samples_file | quote}} # pyright: ignore
|
|
8
|
+
variants_file = {{in.variants_file | quote}} # pyright: ignore
|
|
9
|
+
outdir: str = {{out.outdir | quote}} # pyright: ignore
|
|
10
|
+
|
|
11
|
+
plink = {{envs.plink | repr}} # pyright: ignore
|
|
12
|
+
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
13
|
+
samples: list[str] | str = {{envs.samples | repr}} # pyright: ignore
|
|
14
|
+
variants: list[str] | str = {{envs.variants | repr}} # pyright: ignore
|
|
15
|
+
e_samples_file = {{envs.samples_file | repr}} # pyright: ignore
|
|
16
|
+
e_variants_file = {{envs.variants_file | repr}} # pyright: ignore
|
|
17
|
+
keep = {{envs.keep | repr}} # pyright: ignore
|
|
18
|
+
vfile_type = {{envs.vfile_type | repr}} # pyright: ignore
|
|
19
|
+
chr = {{envs.chr | repr}} # pyright: ignore
|
|
20
|
+
not_chr = {{envs.not_chr | repr}} # pyright: ignore
|
|
21
|
+
autosome = {{envs.autosome | repr}} # pyright: ignore
|
|
22
|
+
autosome_xy = {{envs.autosome_xy | repr}} # pyright: ignore
|
|
23
|
+
snps_only = {{envs.snps_only | repr}} # pyright: ignore
|
|
24
|
+
|
|
25
|
+
samples_file = samples_file or e_samples_file
|
|
26
|
+
if not samples_file and samples:
|
|
27
|
+
samples_file = Path(outdir) / "_samples.txt"
|
|
28
|
+
if isinstance(samples, str):
|
|
29
|
+
samples = [s.strip() for s in samples.split(",")]
|
|
30
|
+
|
|
31
|
+
with open(samples_file, "w") as fh:
|
|
32
|
+
fh.writelines(
|
|
33
|
+
[
|
|
34
|
+
line.replace("/", "\t") + "\n"
|
|
35
|
+
if "/" in line
|
|
36
|
+
else line + "\t" + line + "\n"
|
|
37
|
+
for line in samples
|
|
38
|
+
]
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
variants_file = variants_file or e_variants_file
|
|
42
|
+
if not variants_file and variants:
|
|
43
|
+
if vfile_type != "id":
|
|
44
|
+
logger.warning(
|
|
45
|
+
"envs.vfile_type should be 'id' if only envs.variants is provided."
|
|
46
|
+
)
|
|
47
|
+
vfile_type = "id"
|
|
48
|
+
|
|
49
|
+
variants_file = Path(outdir) / "_variants.txt"
|
|
50
|
+
if isinstance(variants, str):
|
|
51
|
+
variants = [v.strip() for v in variants.split(",")]
|
|
52
|
+
|
|
53
|
+
with open(variants_file, "w") as fh:
|
|
54
|
+
fh.writelines([line + "\n" for line in variants])
|
|
55
|
+
|
|
56
|
+
# Path.glob() yields matches in a file-system dependent order; sort them so
# that "the first one" is reproducible when several .bed files are present.
bedfiles = sorted(Path(indir).glob("*.bed"))
if not bedfiles:
    raise FileNotFoundError("No .bed file found in `in.indir`")
if len(bedfiles) > 1:
    logger.warning("Multiple .bed files found in `in.indir`, using the first one.")

bedfile = bedfiles[0]
# Both --bfile and --out are given as prefixes, i.e. paths without extension
input = bedfile.with_suffix("")
output = Path(outdir) / bedfile.stem

# The "" key holds the executable; the remaining keys are turned into options
# by dict_to_cli_args() below.
args = {
    "": [plink],
    "bfile": input,
    "out": output,
    "threads": ncores,
    "make-bed": True,
}

# `keep` selects the listed samples/variants; otherwise they are dropped
sample_opt, variant_opt = ("keep", "extract") if keep else ("remove", "exclude")
if samples_file:
    args[sample_opt] = samples_file
if variants_file:
    # For list types other than plain IDs, the type is passed as a modifier
    # ahead of the file name.
    args[variant_opt] = (
        variants_file if vfile_type == "id" else [vfile_type, variants_file]
    )

if chr:
    args["chr"] = chr
if not_chr:
    args["not_chr"] = not_chr
if autosome:
    args["autosome"] = True
if autosome_xy:
    # Previously this set "autosome", so --autosome-xy was never passed
    args["autosome_xy"] = True
if snps_only:
    args["snps_only"] = snps_only

run_command(dict_to_cli_args(args, dashify=True, dup_key=False), fg=True)
|