biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +28 -0
- biopipen/core/filters.py +79 -4
- biopipen/core/proc.py +12 -3
- biopipen/core/testing.py +75 -3
- biopipen/ns/bam.py +148 -6
- biopipen/ns/bed.py +75 -0
- biopipen/ns/cellranger.py +186 -0
- biopipen/ns/cellranger_pipeline.py +126 -0
- biopipen/ns/cnv.py +19 -3
- biopipen/ns/cnvkit.py +1 -1
- biopipen/ns/cnvkit_pipeline.py +20 -12
- biopipen/ns/delim.py +34 -35
- biopipen/ns/gene.py +68 -23
- biopipen/ns/gsea.py +63 -37
- biopipen/ns/misc.py +39 -14
- biopipen/ns/plot.py +304 -1
- biopipen/ns/protein.py +183 -0
- biopipen/ns/regulatory.py +290 -0
- biopipen/ns/rnaseq.py +142 -5
- biopipen/ns/scrna.py +2053 -473
- biopipen/ns/scrna_metabolic_landscape.py +228 -382
- biopipen/ns/snp.py +659 -0
- biopipen/ns/stats.py +484 -0
- biopipen/ns/tcr.py +683 -98
- biopipen/ns/vcf.py +236 -2
- biopipen/ns/web.py +97 -6
- biopipen/reports/bam/CNVpytor.svelte +4 -9
- biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
- biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
- biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
- biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
- biopipen/reports/common.svelte +15 -0
- biopipen/reports/protein/ProdigySummary.svelte +16 -0
- biopipen/reports/scrna/CellsDistribution.svelte +4 -39
- biopipen/reports/scrna/DimPlots.svelte +1 -1
- biopipen/reports/scrna/MarkersFinder.svelte +6 -126
- biopipen/reports/scrna/MetaMarkers.svelte +3 -75
- biopipen/reports/scrna/RadarPlots.svelte +4 -20
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
- biopipen/reports/tcr/ClonalStats.svelte +16 -0
- biopipen/reports/tcr/CloneResidency.svelte +3 -93
- biopipen/reports/tcr/Immunarch.svelte +4 -155
- biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
- biopipen/reports/tcr/TESSA.svelte +11 -28
- biopipen/reports/utils/misc.liq +22 -7
- biopipen/scripts/bam/BamMerge.py +11 -15
- biopipen/scripts/bam/BamSampling.py +90 -0
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +38 -0
- biopipen/scripts/bam/CNAClinic.R +41 -5
- biopipen/scripts/bam/CNVpytor.py +153 -54
- biopipen/scripts/bam/ControlFREEC.py +13 -14
- biopipen/scripts/bam/SamtoolsView.py +33 -0
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +138 -0
- biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
- biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
- biopipen/scripts/cnv/AneuploidyScore.R +55 -20
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
- biopipen/scripts/cnv/TMADScore.R +25 -9
- biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
- biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
- biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/RowsBinder.R +1 -1
- biopipen/scripts/delim/SampleInfo.R +116 -118
- biopipen/scripts/gene/GeneNameConversion.R +67 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/gsea/Enrichr.R +5 -5
- biopipen/scripts/gsea/FGSEA.R +184 -50
- biopipen/scripts/gsea/GSEA.R +2 -2
- biopipen/scripts/gsea/PreRank.R +5 -5
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Plot.R +80 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/plot/Heatmap.R +3 -3
- biopipen/scripts/plot/Manhattan.R +147 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/plot/ROC.R +88 -0
- biopipen/scripts/plot/Scatter.R +112 -0
- biopipen/scripts/plot/VennDiagram.R +5 -9
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +119 -0
- biopipen/scripts/protein/ProdigySummary.R +140 -0
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
- biopipen/scripts/regulatory/motifs-common.R +324 -0
- biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
- biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
- biopipen/scripts/rnaseq/Simulation.R +21 -0
- biopipen/scripts/rnaseq/UnitConversion.R +325 -54
- biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
- biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
- biopipen/scripts/scrna/CellCellCommunication.py +150 -0
- biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
- biopipen/scripts/scrna/CellSNPLite.py +30 -0
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
- biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
- biopipen/scripts/scrna/CellsDistribution.R +456 -167
- biopipen/scripts/scrna/DimPlots.R +1 -1
- biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
- biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
- biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
- biopipen/scripts/scrna/ExprImputation.R +7 -0
- biopipen/scripts/scrna/LoomTo10X.R +51 -0
- biopipen/scripts/scrna/MQuad.py +25 -0
- biopipen/scripts/scrna/MarkersFinder.R +679 -400
- biopipen/scripts/scrna/MetaMarkers.R +265 -161
- biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
- biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
- biopipen/scripts/scrna/RadarPlots.R +355 -134
- biopipen/scripts/scrna/ScFGSEA.R +298 -100
- biopipen/scripts/scrna/ScSimulation.R +65 -0
- biopipen/scripts/scrna/ScVelo.py +617 -0
- biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
- biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
- biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
- biopipen/scripts/scrna/SeuratClustering.R +36 -233
- biopipen/scripts/scrna/SeuratLoading.R +2 -2
- biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
- biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
- biopipen/scripts/scrna/SeuratPreparing.R +223 -173
- biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
- biopipen/scripts/scrna/SeuratTo10X.R +27 -0
- biopipen/scripts/scrna/Slingshot.R +65 -0
- biopipen/scripts/scrna/Subset10X.R +2 -2
- biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
- biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
- biopipen/scripts/scrna/scvelo_paga.py +313 -0
- biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
- biopipen/scripts/snp/MatrixEQTL.R +217 -0
- biopipen/scripts/snp/Plink2GTMat.py +148 -0
- biopipen/scripts/snp/PlinkCallRate.R +199 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +291 -0
- biopipen/scripts/snp/PlinkFromVcf.py +81 -0
- biopipen/scripts/snp/PlinkHWE.R +85 -0
- biopipen/scripts/snp/PlinkHet.R +96 -0
- biopipen/scripts/snp/PlinkIBD.R +196 -0
- biopipen/scripts/snp/PlinkSimulation.py +124 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/ChowTest.R +146 -0
- biopipen/scripts/stats/DiffCoexpr.R +152 -0
- biopipen/scripts/stats/LiquidAssoc.R +135 -0
- biopipen/scripts/stats/Mediation.R +108 -0
- biopipen/scripts/stats/MetaPvalue.R +130 -0
- biopipen/scripts/stats/MetaPvalue1.R +74 -0
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/Attach2Seurat.R +3 -2
- biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
- biopipen/scripts/tcr/CDR3Clustering.R +343 -0
- biopipen/scripts/tcr/ClonalStats.R +526 -0
- biopipen/scripts/tcr/CloneResidency.R +255 -131
- biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
- biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
- biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
- biopipen/scripts/tcr/GIANA/query.py +164 -162
- biopipen/scripts/tcr/Immunarch-basic.R +31 -9
- biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
- biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
- biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
- biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
- biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
- biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
- biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
- biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
- biopipen/scripts/tcr/Immunarch.R +63 -11
- biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
- biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
- biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
- biopipen/scripts/tcr/SampleDiversity.R +1 -1
- biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
- biopipen/scripts/tcr/ScRepLoading.R +166 -0
- biopipen/scripts/tcr/TCRClusterStats.R +176 -22
- biopipen/scripts/tcr/TCRDock.py +110 -0
- biopipen/scripts/tcr/TESSA.R +102 -118
- biopipen/scripts/tcr/VJUsage.R +5 -5
- biopipen/scripts/tcr/immunarch-patched.R +142 -0
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
- biopipen/scripts/vcf/TruvariConsistency.R +1 -1
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +13 -4
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
- biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
- biopipen/scripts/web/gcloud_common.py +49 -0
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.py +146 -20
- biopipen/utils/reference.py +64 -20
- biopipen/utils/reporter.py +177 -0
- biopipen/utils/vcf.py +1 -1
- biopipen-0.34.26.dist-info/METADATA +27 -0
- biopipen-0.34.26.dist-info/RECORD +292 -0
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
- biopipen/ns/bcftools.py +0 -111
- biopipen/ns/scrna_basic.py +0 -255
- biopipen/reports/delim/SampleInfo.svelte +0 -36
- biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
- biopipen/reports/scrna/ScFGSEA.svelte +0 -35
- biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
- biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
- biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
- biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
- biopipen/reports/utils/gsea.liq +0 -110
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
- biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
- biopipen/scripts/scrna/ExprImpution.R +0 -7
- biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
- biopipen/scripts/scrna/Write10X.R +0 -11
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
- biopipen/scripts/tcr/TCRClustering.R +0 -280
- biopipen/utils/common_docstrs.py +0 -61
- biopipen/utils/gene.R +0 -49
- biopipen/utils/gsea.R +0 -193
- biopipen/utils/io.R +0 -20
- biopipen/utils/misc.R +0 -114
- biopipen/utils/mutate_helpers.R +0 -433
- biopipen/utils/plot.R +0 -173
- biopipen/utils/rnaseq.R +0 -48
- biopipen/utils/single_cell.R +0 -115
- biopipen-0.21.0.dist-info/METADATA +0 -22
- biopipen-0.21.0.dist-info/RECORD +0 -218
biopipen/ns/snp.py
ADDED
|
@@ -0,0 +1,659 @@
|
|
|
1
|
+
"""Plink processes"""
|
|
2
|
+
|
|
3
|
+
from ..core.proc import Proc
|
|
4
|
+
from ..core.config import config
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PlinkSimulation(Proc):
|
|
8
|
+
"""Simulate SNPs using PLINK v2
|
|
9
|
+
|
|
10
|
+
See also <https://www.cog-genomics.org/plink/2.0/input#simulate> and
|
|
11
|
+
<https://pwwang.github.io/biopipen/api/biopipen.ns.snp/#biopipen.ns.snp.PlinkSimulation>
|
|
12
|
+
|
|
13
|
+
Input:
|
|
14
|
+
configfile: Configuration file containing the parameters for the simulation.
|
|
15
|
+
The configuration file (in toml, yaml or json format) should contain a
|
|
16
|
+
dictionary of parameters. The parameters are listed in `envs` except
|
|
17
|
+
`ncores`, which is used for parallelization. You can set parameters
|
|
18
|
+
in `envs` and override them in the configuration file.
|
|
19
|
+
|
|
20
|
+
Output:
|
|
21
|
+
outdir: Output directory containing the simulated data
|
|
22
|
+
`plink_sim.bed`, `plink_sim.bim`, and `plink_sim.fam` will be generated.
|
|
23
|
+
gtmat: Genotype matrix file containing the simulated data with rows representing
|
|
24
|
+
SNPs and columns representing samples.
|
|
25
|
+
|
|
26
|
+
Envs:
|
|
27
|
+
nsnps (type=int): Number of SNPs to simulate
|
|
28
|
+
ncases (type=int): Number of cases to simulate
|
|
29
|
+
nctrls (type=int): Number of controls to simulate
|
|
30
|
+
plink: Path to PLINK v2
|
|
31
|
+
seed (type=int): Random seed. If not set, seed will not be set.
|
|
32
|
+
label: Prefix label for the SNPs.
|
|
33
|
+
prevalence (type=float): Disease prevalence.
|
|
34
|
+
minfreq (type=float): Minimum allele frequency.
|
|
35
|
+
maxfreq (type=float): Maximum allele frequency.
|
|
36
|
+
hetodds (type=float): Odds ratio for heterozygous genotypes.
|
|
37
|
+
homodds (type=float): Odds ratio for homozygous genotypes.
|
|
38
|
+
missing (type=float): Proportion of missing genotypes.
|
|
39
|
+
args (ns): Additional arguments to pass to PLINK.
|
|
40
|
+
- <more>: see <https://www.cog-genomics.org/plink/2.0/input#simulate>.
|
|
41
|
+
transpose_gtmat (flag): If set, the genotype matrix (`out.gtmat`) will
|
|
42
|
+
be transposed.
|
|
43
|
+
sample_prefix: Use this prefix for the sample names. If not set, the sample
|
|
44
|
+
names will be `per0_per0`, `per1_per1`, `per2_per2`, etc. If set, the
|
|
45
|
+
sample names will be `prefix0`, `prefix1`, `prefix2`, etc.
|
|
46
|
+
This only affects the sample names in the genotype matrix file
|
|
47
|
+
(`out.gtmat`).
|
|
48
|
+
"""
|
|
49
|
+
input = "configfile:file"
|
|
50
|
+
output = [
|
|
51
|
+
"outdir:dir:{{in.configfile | stem}}.plink_sim",
|
|
52
|
+
"gtmat:file:{{in.configfile | stem}}.plink_sim/"
|
|
53
|
+
"{{in.configfile | stem}}-gtmat.txt",
|
|
54
|
+
]
|
|
55
|
+
lang = config.lang.python
|
|
56
|
+
envs = {
|
|
57
|
+
"nsnps": None,
|
|
58
|
+
"ncases": None,
|
|
59
|
+
"nctrls": None,
|
|
60
|
+
"plink": config.exe.plink,
|
|
61
|
+
"seed": None,
|
|
62
|
+
"label": "SNP",
|
|
63
|
+
"prevalence": 0.01,
|
|
64
|
+
"minfreq": 0.0,
|
|
65
|
+
"maxfreq": 1.0,
|
|
66
|
+
"hetodds": 1.0,
|
|
67
|
+
"homodds": 1.0,
|
|
68
|
+
"missing": 0.0,
|
|
69
|
+
"args": {},
|
|
70
|
+
"transpose_gtmat": False,
|
|
71
|
+
"sample_prefix": None,
|
|
72
|
+
}
|
|
73
|
+
script = "file://../scripts/snp/PlinkSimulation.py"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class MatrixEQTL(Proc):
|
|
77
|
+
"""Run Matrix eQTL
|
|
78
|
+
|
|
79
|
+
See also <https://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/>
|
|
80
|
+
|
|
81
|
+
Input:
|
|
82
|
+
geno: Genotype matrix file with rows representing SNPs and columns
|
|
83
|
+
representing samples.
|
|
84
|
+
expr: Expression matrix file with rows representing genes and columns
|
|
85
|
+
representing samples.
|
|
86
|
+
cov: Covariate matrix file with rows representing covariates and columns
|
|
87
|
+
representing samples.
|
|
88
|
+
|
|
89
|
+
Output:
|
|
90
|
+
alleqtls: Matrix eQTL output file
|
|
91
|
+
cisqtls: The cis-eQTL file if `snppos` and `genepos` are provided.
|
|
92
|
+
Otherwise it'll be empty.
|
|
93
|
+
|
|
94
|
+
Envs:
|
|
95
|
+
model (choice): The model to use.
|
|
96
|
+
- linear: Linear model
|
|
97
|
+
- modelLINEAR: Same as `linear`
|
|
98
|
+
- anova: ANOVA model
|
|
99
|
+
- modelANOVA: Same as `anova`
|
|
100
|
+
pval (type=float): P-value threshold for eQTLs
|
|
101
|
+
match_samples (flag): Match samples in the genotype and expression matrices.
|
|
102
|
+
If True, an error will be raised if samples from `in.geno`, `in.expr`,
|
|
103
|
+
and `in.cov` (if provided) are not the same.
|
|
104
|
+
If False, common samples will be used to subset the matrices.
|
|
105
|
+
transp (type=float): P-value threshold for trans-eQTLs.
|
|
106
|
+
If cis-eQTLs are not enabled (`snppos` and `genepos` are not set),
|
|
107
|
+
this defaults to 1e-5.
|
|
108
|
+
If cis-eQTLs are enabled, this defaults to `None`, which will disable
|
|
109
|
+
trans-eQTL analysis.
|
|
110
|
+
fdr (flag): Do FDR calculation or not (save memory if not).
|
|
111
|
+
snppos: The path of the SNP position file.
|
|
112
|
+
It could be a BED, GFF, VCF or a tab-delimited file with
|
|
113
|
+
`snp`, `chr`, `pos` as the first 3 columns.
|
|
114
|
+
genepos: The path of the gene position file.
|
|
115
|
+
It could be a BED or GFF file.
|
|
116
|
+
dist (type=int): Distance threshold for cis-eQTLs.
|
|
117
|
+
transpose_geno (flag): If set, the genotype matrix (`in.geno`)
|
|
118
|
+
will be transposed.
|
|
119
|
+
transpose_expr (flag): If set, the expression matrix (`in.expr`)
|
|
120
|
+
will be transposed.
|
|
121
|
+
transpose_cov (flag): If set, the covariate matrix (`in.cov`)
|
|
122
|
+
will be transposed.
|
|
123
|
+
"""
|
|
124
|
+
input = "geno:file, expr:file, cov:file"
|
|
125
|
+
output = [
|
|
126
|
+
"alleqtls:file:{{in.geno | stem}}.alleqtls.txt",
|
|
127
|
+
"cisqtls:file:{{in.geno | stem}}.cisqtls.txt",
|
|
128
|
+
]
|
|
129
|
+
lang = config.lang.rscript
|
|
130
|
+
envs = {
|
|
131
|
+
"model": "linear",
|
|
132
|
+
"pval": 1e-3,
|
|
133
|
+
"match_samples": False,
|
|
134
|
+
"transp": None,
|
|
135
|
+
"fdr": False,
|
|
136
|
+
"snppos": None,
|
|
137
|
+
"genepos": config.ref.refgene,
|
|
138
|
+
"dist": 250000,
|
|
139
|
+
"transpose_geno": False,
|
|
140
|
+
"transpose_expr": False,
|
|
141
|
+
"transpose_cov": False,
|
|
142
|
+
}
|
|
143
|
+
script = "file://../scripts/snp/MatrixEQTL.R"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class PlinkFromVcf(Proc):
|
|
147
|
+
"""Convert VCF to PLINK format.
|
|
148
|
+
|
|
149
|
+
The PLINK format consists of 3 files: `.bed`, `.bim`, and `.fam`.
|
|
150
|
+
|
|
151
|
+
Requires PLINK v2
|
|
152
|
+
|
|
153
|
+
TODO:
|
|
154
|
+
Handle sex when sex chromosomes are included.
|
|
155
|
+
|
|
156
|
+
Input:
|
|
157
|
+
invcf: VCF file
|
|
158
|
+
|
|
159
|
+
Output:
|
|
160
|
+
outdir: Output directory containing the PLINK files
|
|
161
|
+
|
|
162
|
+
Envs:
|
|
163
|
+
plink: Path to PLINK v2
|
|
164
|
+
tabix: Path to tabix
|
|
165
|
+
ncores (type=int): Number of cores/threads to use, will pass to plink
|
|
166
|
+
`--threads` option
|
|
167
|
+
vcf_half_call (choice): The current VCF standard does not specify
|
|
168
|
+
how '0/.' and similar GT values should be interpreted.
|
|
169
|
+
- error: error out and report the line number of the anomaly
|
|
170
|
+
- e: alias for `error`
|
|
171
|
+
- haploid: treat half-calls as haploid/homozygous
|
|
172
|
+
- h: alias for `haploid`
|
|
173
|
+
- missing: treat half-calls as missing
|
|
174
|
+
- m: alias for `missing`
|
|
175
|
+
- reference: treat the missing part as reference
|
|
176
|
+
- r: alias for `reference`
|
|
177
|
+
double_id (flag): set both FIDs and IIDs to the VCF/BCF sample ID.
|
|
178
|
+
vcf_filter (auto): skip variants which failed one or more filters tracked
|
|
179
|
+
by the FILTER field.
|
|
180
|
+
If True, only FILTER with `PASS` or `.` will be kept.
|
|
181
|
+
Multiple filters can be specified by separating them with space or
|
|
182
|
+
as a list.
|
|
183
|
+
vcf_idspace_to: convert all spaces in sample IDs to this character.
|
|
184
|
+
set_missing_var_ids: update variant IDs using a template string,
|
|
185
|
+
with a '@' where the chromosome code should go, and a '#' where the
|
|
186
|
+
base-pair position belongs. You can also specify `\\$r` and `\\$a` for
|
|
187
|
+
the reference and alternate alleles, respectively.
|
|
188
|
+
See <https://www.cog-genomics.org/plink/2.0/data#set_all_var_ids>
|
|
189
|
+
max_alleles (type=int): Maximum number of alleles per variant.
|
|
190
|
+
<more>: see <https://www.cog-genomics.org/plink/2.0/> for more options.
|
|
191
|
+
Note that `_` will be replaced by `-` in the argument names.
|
|
192
|
+
""" # noqa: E501
|
|
193
|
+
input = "invcf:file"
|
|
194
|
+
output = "outdir:dir:{{in.invcf.stem | regex_replace: '\\.gz$', ''}}"
|
|
195
|
+
lang = config.lang.python
|
|
196
|
+
envs = {
|
|
197
|
+
"plink": config.exe.plink2,
|
|
198
|
+
"tabix": config.exe.tabix,
|
|
199
|
+
"ncores": config.misc.ncores,
|
|
200
|
+
"vcf_half_call": "missing",
|
|
201
|
+
"double_id": True,
|
|
202
|
+
"vcf_filter": True,
|
|
203
|
+
"vcf_idspace_to": "_",
|
|
204
|
+
"set_missing_var_ids": "@_#",
|
|
205
|
+
"max_alleles": 2,
|
|
206
|
+
}
|
|
207
|
+
script = "file://../scripts/snp/PlinkFromVcf.py"
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class Plink2GTMat(Proc):
|
|
211
|
+
"""Convert PLINK files to genotype matrix.
|
|
212
|
+
|
|
213
|
+
Requires PLINK v2. The .raw/.traw file is generated by plink and then transformed
|
|
214
|
+
to a genotype matrix file.
|
|
215
|
+
See <https://www.cog-genomics.org/plink/2.0/formats#raw> and
|
|
216
|
+
<https://www.cog-genomics.org/plink/2.0/formats#traw> for more information.
|
|
217
|
+
|
|
218
|
+
The allelic dosage is used as the values of genotype matrix.
|
|
219
|
+
"--keep-allele-order" is used to keep the allele order consistent with the
|
|
220
|
+
reference allele first. This way, the genotype of homozygous reference alleles
|
|
221
|
+
will be encoded as 2, heterozygous as 1, and homozygous alternate alleles as 0.
|
|
222
|
+
This is the PLINK dosage encoding. If you want to use this encoding, you can
|
|
223
|
+
set `envs.gtcoding` to `plink`. Otherwise, the default encoding is `vcf`, which
|
|
224
|
+
will encode the genotype as 0, 1, and 2 for homozygous reference, heterozygous,
|
|
225
|
+
and homozygous alternate alleles, respectively.
|
|
226
|
+
|
|
227
|
+
Note that `envs.gtcoding = "vcf"` only works for biallelic variants for now.
|
|
228
|
+
|
|
229
|
+
Input:
|
|
230
|
+
indir: Input directory containing the PLINK files.
|
|
231
|
+
Including `.bed`, `.bim`, and `.fam` files
|
|
232
|
+
|
|
233
|
+
Output:
|
|
234
|
+
outfile: Genotype matrix file with rows representing SNPs and columns
|
|
235
|
+
representing samples if `envs.transpose` is `False`.
|
|
236
|
+
|
|
237
|
+
Envs:
|
|
238
|
+
plink: Path to PLINK v2.0
|
|
239
|
+
ncores (type=int): Number of cores/threads to use, will pass to plink
|
|
240
|
+
`--threads` option
|
|
241
|
+
transpose (flag): If set, the genotype matrix (`out.outfile`) is transposed.
|
|
242
|
+
samid: what to use as sample ID.
|
|
243
|
+
Placeholders include `{fid}` and `{iid}` for family and individual IDs,
|
|
244
|
+
respectively.
|
|
245
|
+
varid: what to use as variant ID.
|
|
246
|
+
Placeholders include `{chr}`, `{pos}`, `{varid}`, `{ref}`, and `{alt}` for
|
|
247
|
+
chromosome, position, rsID, reference allele, and alternate allele,
|
|
248
|
+
respectively.
|
|
249
|
+
trans_chr: A dictionary to translate chromosome numbers to chromosome names.
|
|
250
|
+
missing_id: what to use as the variant ID (rsID) if it is missing.
|
|
251
|
+
gtcoding (choice): The genotype coding to use.
|
|
252
|
+
- vcf: 0/1/2 for homozygous reference, heterozygous, and homozygous
|
|
253
|
+
alternate alleles, respectively.
|
|
254
|
+
- plink: 2/1/0 for homozygous reference, heterozygous, and homozygous
|
|
255
|
+
alternate alleles, respectively.
|
|
256
|
+
"""
|
|
257
|
+
input = "indir:dir"
|
|
258
|
+
output = "outfile:file:{{in.indir | stem}}-gtmat.txt"
|
|
259
|
+
lang = config.lang.python
|
|
260
|
+
envs = {
|
|
261
|
+
"plink": config.exe.plink2,
|
|
262
|
+
"ncores": config.misc.ncores,
|
|
263
|
+
"transpose": False,
|
|
264
|
+
"samid": "{fid}_{iid}",
|
|
265
|
+
"varid": "{chr}_{pos}_{varid}_{ref}_{alt}",
|
|
266
|
+
"trans_chr": {"23": "X", "24": "Y", "25": "XY", "26": "M"},
|
|
267
|
+
"missing_id": "NA",
|
|
268
|
+
"gtcoding": "vcf",
|
|
269
|
+
}
|
|
270
|
+
script = "file://../scripts/snp/Plink2GTMat.py"
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
class PlinkIBD(Proc):
|
|
274
|
+
"""Run PLINK IBD analysis (identity by descent)
|
|
275
|
+
|
|
276
|
+
See also <https://www.cog-genomics.org/plink/1.9/ibd>
|
|
277
|
+
This has to run with PLINK v1.9. PLINK v2 does not support IBD analysis yet.
|
|
278
|
+
|
|
279
|
+
Input:
|
|
280
|
+
indir: Input directory containing the PLINK files.
|
|
281
|
+
Including `.bed`, `.bim`, and `.fam` files
|
|
282
|
+
|
|
283
|
+
Output:
|
|
284
|
+
outdir: Output directory containing the IBD results.
|
|
285
|
+
Including [`.genome`](https://www.cog-genomics.org/plink/1.9/formats#genome)
|
|
286
|
+
file for the original IBD report from PLINK, and `.ibd.png` for the
|
|
287
|
+
heatmap of `PI_HAT` values.
|
|
288
|
+
|
|
289
|
+
Envs:
|
|
290
|
+
plink: Path to PLINK v1.9
|
|
291
|
+
ncores (type=int): Number of cores/threads to use, will pass to plink
|
|
292
|
+
`--threads` option
|
|
293
|
+
highld: High LD regions to be excluded from the analysis.
|
|
294
|
+
If not set, no regions will be excluded.
|
|
295
|
+
samid: what to use as sample ID.
|
|
296
|
+
Placeholders include `{fid}` and `{iid}` for family and individual IDs,
|
|
297
|
+
respectively
|
|
298
|
+
indep (type=auto): LD pruning parameters. Either a list of numerics or a string
|
|
299
|
+
concatenated by `,` to specify
|
|
300
|
+
1) consider a window of N SNPs (e.g. 50),
|
|
301
|
+
2) calculate LD between each pair of SNPs in the window, shifting the window by N SNPs at each step (e.g. 5),
|
|
302
|
+
3) remove one of a pair of SNPs if the LD is greater than X (e.g. 0.2).
|
|
303
|
+
pihat (type=float): PI_HAT threshold for IBD analysis.
|
|
304
|
+
See also <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5007749/>
|
|
305
|
+
plot (flag): If set, plot the heatmap of `PI_HAT` values.
|
|
306
|
+
anno: The annotation file for the samples, used to plot on the heatmap.
|
|
307
|
+
Names must match the ones that are transformed by `envs.samid`.
|
|
308
|
+
seed (type=int): Random seed for the analysis.
|
|
309
|
+
devpars (ns): The device parameters for the plot.
|
|
310
|
+
- width (type=int): Width of the plot
|
|
311
|
+
- height (type=int): Height of the plot
|
|
312
|
+
- res (type=int): Resolution of the plot
|
|
313
|
+
"""
|
|
314
|
+
input = "indir:dir"
|
|
315
|
+
output = "outdir:dir:{{in.indir | stem}}.ibd"
|
|
316
|
+
lang = config.lang.rscript
|
|
317
|
+
envs = {
|
|
318
|
+
"plink": config.exe.plink,
|
|
319
|
+
"ncores": config.misc.ncores,
|
|
320
|
+
"highld": None,
|
|
321
|
+
"samid": "{fid}_{iid}",
|
|
322
|
+
"indep": [50, 5, 0.2],
|
|
323
|
+
"pihat": 0.1875,
|
|
324
|
+
"plot": True,
|
|
325
|
+
"anno": None,
|
|
326
|
+
"seed": 8525,
|
|
327
|
+
"devpars": {"width": 1000, "height": 1000, "res": 100},
|
|
328
|
+
}
|
|
329
|
+
script = "file://../scripts/snp/PlinkIBD.R"
|
|
330
|
+
plugin_opts = {"report": "file://../reports/snp/PlinkIBD.svelte"}
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
class PlinkHWE(Proc):
|
|
334
|
+
"""Hardy-Weinberg Equilibrium report and filtering
|
|
335
|
+
|
|
336
|
+
See also <https://www.cog-genomics.org/plink/2.0/basic_stats#hardy>
|
|
337
|
+
|
|
338
|
+
Input:
|
|
339
|
+
indir: Input directory containing the PLINK files.
|
|
340
|
+
Including `.bed`, `.bim`, and `.fam` files
|
|
341
|
+
|
|
342
|
+
Output:
|
|
343
|
+
outdir: Output directory containing the HWE results.
|
|
344
|
+
Including [`.hwe`](https://www.cog-genomics.org/plink/2.0/formats#hwe)
|
|
345
|
+
file for the original HWE report from PLINK and
|
|
346
|
+
`.hardy.fail` for the variants that failed the HWE test.
|
|
347
|
+
It also includes binary files `.bed`, `.bim`, and `.fam`
|
|
348
|
+
|
|
349
|
+
Envs:
|
|
350
|
+
plink: Path to PLINK v2
|
|
351
|
+
ncores (type=int): Number of cores/threads to use, will pass to plink
|
|
352
|
+
`--threads` option
|
|
353
|
+
cutoff (type=float): P-value cutoff for HWE test
|
|
354
|
+
plot (flag): If set, plot the distribution of HWE p-values.
|
|
355
|
+
devpars (ns): The device parameters for the plot.
|
|
356
|
+
- width (type=int): Width of the plot
|
|
357
|
+
- height (type=int): Height of the plot
|
|
358
|
+
- res (type=int): Resolution of the plot
|
|
359
|
+
"""
|
|
360
|
+
input = "indir:dir"
|
|
361
|
+
output = "outdir:dir:{{in.indir | stem}}.hwe"
|
|
362
|
+
lang = config.lang.rscript
|
|
363
|
+
envs = {
|
|
364
|
+
"plink": config.exe.plink2,
|
|
365
|
+
"ncores": config.misc.ncores,
|
|
366
|
+
"cutoff": 1e-5,
|
|
367
|
+
"plot": True,
|
|
368
|
+
"devpars": {"width": 1000, "height": 800, "res": 100},
|
|
369
|
+
}
|
|
370
|
+
script = "file://../scripts/snp/PlinkHWE.R"
|
|
371
|
+
plugin_opts = {"report": "file://../reports/snp/PlinkHWE.svelte"}
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
class PlinkHet(Proc):
    """Calculation of sample heterozygosity.

    Input:
        indir: Input directory containing the PLINK files.
            Including `.bed`, `.bim`, and `.fam` files

    Output:
        outdir: Output directory containing the heterozygosity results.
            Including [`.het`](https://www.cog-genomics.org/plink/2.0/formats#het)
            file for the original heterozygosity report from PLINK and
            `.het.fail` for the samples that failed the heterozygosity test.
            It also includes binary files `.bed`, `.bim`, and `.fam`

    Envs:
        plink: Path to PLINK v2, at least v2.00a5.10
        ncores (type=int): Number of cores/threads to use, will pass to plink
            `--threads` option
        cutoff (type=float): Heterozygosity cutoff, samples with heterozygosity
            beyond `mean - cutoff * sd` or `mean + cutoff * sd` will be considered
            as outliers.
        plot (flag): If set, plot the distribution of heterozygosity values.
        devpars (ns): The device parameters for the plot.
            - width (type=int): Width of the plot
            - height (type=int): Height of the plot
            - res (type=int): Resolution of the plot
    """

    # One directory-typed input; the output directory name is built from the
    # `stem` filter applied to `in.indir`, suffixed with `.het`.
    input = "indir:dir"
    output = "outdir:dir:{{in.indir | stem}}.het"
    lang = config.lang.rscript
    # Defaults for the options documented under `Envs` above.
    envs = {
        "plink": config.exe.plink2,
        "ncores": config.misc.ncores,
        "cutoff": 3.0,
        "plot": True,
        "devpars": {"width": 1000, "height": 800, "res": 100},
    }
    script = "file://../scripts/snp/PlinkHet.R"
    plugin_opts = {"report": "file://../reports/snp/PlinkHet.svelte"}
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
class PlinkCallRate(Proc):
    """Calculation of call rate for the samples and variants.

    Input:
        indir: Input directory containing the PLINK files.
            Including `.bed`, `.bim`, and `.fam` files

    Output:
        outdir: Output directory containing the call rate results.
            Including [`.imiss`](https://www.cog-genomics.org/plink/2.0/formats#imiss)
            file for missing calls for samples,
            [`.lmiss`](https://www.cog-genomics.org/plink/2.0/formats#lmiss) for
            missing calls for variants, `.samplecr.fail` for the samples that
            fail the sample call rate cutoff (`envs.samplecr`), and
            `.varcr.fail` for the SNPs that fail the variant call rate cutoff
            (`envs.varcr`).
            It also includes binary files `.bed`, `.bim`, and `.fam`.

    Envs:
        plink: Path to PLINK v2
        ncores (type=int): Number of cores/threads to use, will pass to plink
            `--threads` option
        samplecr (type=float): Sample call rate cutoff
        varcr (type=float): Variant call rate cutoff
        max_iter (type=int): Maximum number of iterations to run the call rate
            calculation.
            Since the sample and variant call rates are affected by each other,
            it may be necessary to iterate the calculation to get stable results.
        plot (flag): If set, plot the distribution of call rates.
        devpars (ns): The device parameters for the plot.
            - width (type=int): Width of the plot
            - height (type=int): Height of the plot
            - res (type=int): Resolution of the plot
    """

    # One directory-typed input; the output directory name is built from the
    # `stem` filter applied to `in.indir`, suffixed with `.callrate`.
    input = "indir:dir"
    output = "outdir:dir:{{in.indir | stem}}.callrate"
    lang = config.lang.rscript
    # Defaults for the options documented under `Envs` above.
    envs = {
        "plink": config.exe.plink2,
        "ncores": config.misc.ncores,
        "samplecr": 0.95,
        "varcr": 0.95,
        "max_iter": 3,
        "plot": True,
        "devpars": {"width": 1000, "height": 800, "res": 100},
    }
    script = "file://../scripts/snp/PlinkCallRate.R"
    plugin_opts = {"report": "file://../reports/snp/PlinkCallRate.svelte"}
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
class PlinkFilter(Proc):
    """Filter samples and variants for PLINK files.

    Input:
        indir: Input directory containing the PLINK files.
            Including `.bed`, `.bim`, and `.fam` files
        samples_file: File containing the sample IDs.
        variants_file: File containing the variant IDs or regions.

    Output:
        outdir: Output directory containing the filtered PLINK files.
            Including `.bed`, `.bim`, and `.fam` files

    Envs:
        plink: Path to PLINK v2
        ncores (type=int): Number of cores/threads to use, will pass to plink
            `--threads` option
        samples (auto): Sample IDs.
            If both FID and IID are provided, they should be separated by `/`.
            Otherwise, the FID and IID are assumed to be the same.
            A list of sample IDs or string concatenated by `,`.
            If either `in.samples_file` or `envs.samples_file` is set,
            this will be ignored.
        variants (auto): Variant IDs.
            A list of variant IDs or string concatenated by `,`.
            If either `in.variants_file` or `envs.variants_file` is set,
            this will be ignored.
        samples_file: File containing the sample IDs.
            If `in.samples_file` is set, this will be ignored.
        variants_file: File containing the variant IDs.
            If `in.variants_file` is set, this will be ignored.
        keep (flag): Use `samples`/`variants`/`samples_file`/`variants_file` to
            only keep the specified samples/variants, instead of removing them.
        vfile_type (choice): The type of the variants file.
            - id: Variant IDs
            - bed0: 0-based BED file
            - bed1: 1-based BED file
        chr: Chromosome to keep.
            For example, `1-4 22 XY` will keep chromosomes 1 to 4, 22, and XY.
        not_chr: Chromosome to remove.
            For example, `1-4 22 XY` will remove chromosomes 1 to 4, 22, and XY.
        autosome (flag): Excludes all unplaced and non-autosomal variants
        autosome_xy (flag): Does `autosome` but does not exclude the
            pseudo-autosomal region of X.
        snps_only (auto): Excludes all variants with one or more multi-character
            allele codes. With 'just-acgt', variants with single-character
            allele codes outside of
            {'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', <missing code>}
            are also excluded.
    """

    # Three input channels: the PLINK directory plus sample and variant files.
    input = [
        "indir:dir",
        "samples_file:file",
        "variants_file:file",
    ]
    # Output directory name is built from the `stem` filter applied to
    # `in.indir`, suffixed with `.filtered`.
    output = "outdir:dir:{{in.indir | stem}}.filtered"
    lang = config.lang.python
    # Defaults for the options documented under `Envs` above.
    envs = {
        "plink": config.exe.plink2,
        "ncores": config.misc.ncores,
        "samples": None,
        "variants": None,
        "samples_file": None,
        "variants_file": None,
        "keep": False,
        "vfile_type": "id",
        "chr": None,
        "not_chr": None,
        "autosome": False,
        "autosome_xy": False,
        "snps_only": False,
    }
    script = "file://../scripts/snp/PlinkFilter.py"
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
class PlinkFreq(Proc):
    """Calculate allele frequencies for the variants.

    Input:
        indir: Input directory containing the PLINK files.
            Including `.bed`, `.bim`, and `.fam` files

    Output:
        outdir: Output directory containing the allele frequency results.
            By default, it includes
            [`.afreq`](https://www.cog-genomics.org/plink/2.0/formats#afreq)
            file for the allele frequency report from PLINK.
            Modifiers can be added to change this behavior.
            See `envs.modifier` for more information.
            When `envs.filter != no`, it also includes binary files `.bed`, `.bim`,
            and `.fam` after filtering with `envs.cutoff`.

    Envs:
        plink: Path to PLINK v2
        ncores (type=int): Number of cores/threads to use, will pass to plink
            `--threads` option
        modifier (choice): The modifier of `--freq` to control the output behavior.
            - none: No modifier, only the `.afreq` file will be generated.
                `MAF` (minor allele frequency) will be added in addition to the
                `REF_FREQ` and `ALT1_FREQ` columns. Check `.afreqx` for the added
                columns.
            - counts: write allele count report to `.acount`.
                See <https://www.cog-genomics.org/plink/2.0/formats#afreq>.
                `ALT1`, `ALT1_CT`, and `REF_CT` are added. Check `.acountx` for
                the added columns.
            - x: write genotype count report to `.gcount`
                Like `--freqx` in v1.9, `--geno-counts` will be run to generate
                the genotype counts.
                `ALT1`, `HET_REF_ALT1_CT`, and `HOM_ALT1_CT` are added. Check
                `.gcountx` for the added columns.
        gz (flag): If set, compress the output files.
        cutoff (auto): Cutoffs to mark or filter the variants.
            If a float is given, default column will be used based on the modifier.
            For `modifier="none"`, it defaults to `MAF`.
            For `modifier="counts"`, it defaults to `ALT1_CT`.
            For `modifier="x"`, it defaults to `HOM_ALT1_CT`.
            Or this could be a dictionary to specify the column names and cutoffs.
            For example, `{"MAF": 0.05}`.
        filter (auto): The direction of filtering variants based on `cutoff`.
            If a single value is given, it will apply to all columns provided in
            `cutoff`. If a dictionary is given, it will apply to the corresponding
            column. If a column cannot be found in the dictionary, it defaults to
            `no`.
            no: Do not filter variants (no binary files are generated in outdir).
            gt: Filter variants with MAF greater than `cutoff`.
            lt: Filter variants with MAF less than `cutoff`.
            ge: Filter variants with MAF greater than or equal to `cutoff`.
            le: Filter variants with MAF less than or equal to `cutoff`.
        plot (flag): If set, plot the distribution of allele frequencies.
        devpars (ns): The device parameters for the plot.
            - width (type=int): Width of the plot
            - height (type=int): Height of the plot
            - res (type=int): Resolution of the plot
    """

    # One directory-typed input; the output directory name is built from the
    # `stem` filter applied to `in.indir`, suffixed with `.freq`.
    input = "indir:dir"
    output = "outdir:dir:{{in.indir | stem}}.freq"
    lang = config.lang.rscript
    # Defaults for the options documented under `Envs` above; empty `cutoff`
    # and `filter` mappings are the shipped defaults.
    envs = {
        "plink": config.exe.plink2,
        "ncores": config.misc.ncores,
        "modifier": "none",
        "gz": False,
        "cutoff": {},
        "filter": {},
        "plot": True,
        "devpars": {"width": 1000, "height": 800, "res": 100},
    }
    script = "file://../scripts/snp/PlinkFreq.R"
    plugin_opts = {"report": "file://../reports/snp/PlinkFreq.svelte"}
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
class PlinkUpdateName(Proc):
    """Update variant names in PLINK files.

    See also <https://www.cog-genomics.org/plink/2.0/data#update_map>.

    Input:
        indir: Input directory containing the PLINK files.
            Including `.bed`, `.bim`, and `.fam` files
        namefile: File containing the variant names to update.
            Either a file containing two columns, the first column is the old
            variant name, and the second column is the new variant name.
            Or a VCF file containing the variant names to update.
            When a VCF file is given, the chromosome, position, and reference and
            alternate alleles will be used to match the variants.

    Output:
        outdir: Output directory containing the updated PLINK files.
            Including `.bed`, `.bim`, and `.fam` files

    Envs:
        ncores (type=int): Number of cores/threads to use, will pass to plink
            `--threads` option
        plink: Path to PLINK v2
        bcftools: Path to bcftools
        match_alt (choice): How to match alternate alleles when `in.namefile`
            is a VCF file.
            - exact: Matches alternate alleles exactly.
            - all: Matches alternate alleles regardless of the order.
                `chr1:100:A:T,G` matches `chr1:100:A:G,T` or `chr1:100:A:T,G`.
            - any: Matches any alternate allele.
                For example, `chr1:100:A:T,G` matches `chr1:100:A:G,C`
            - first_included: Matches when the first allele is included.
                For example, `chr1:100:A:T,G` matches `chr1:100:A:C,T`.
            - first: Match first alternate allele
                For example, `chr1:100:A:T,G` matches `chr1:100:A:T`.
            - none: Do not match alternate alleles
    """

    # Two input channels: the PLINK directory and the name-mapping file.
    input = "indir:dir, namefile:file"
    # Output directory name is built from the `stem` filter applied to
    # `in.indir`, suffixed with `.newnames`.
    output = "outdir:dir:{{in.indir | stem}}.newnames"
    lang = config.lang.python
    # Defaults for the options documented under `Envs` above. `ncores` is
    # annotated `(type=int)` there to match the other Plink* processes.
    envs = {
        "ncores": config.misc.ncores,
        "plink": config.exe.plink2,
        "bcftools": config.exe.bcftools,
        "match_alt": "exact",
    }
    script = "file://../scripts/snp/PlinkUpdateName.py"
|