PyPI - biopipen - Versions diffs - 0.31.4__py3-none-any.whl → 0.31.6__py3-none-any.whl - Mend

biopipen 0.31.4__py3-none-any.whl → 0.31.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (23) hide show

biopipen/__init__.py +1 -1
biopipen/ns/bam.py +41 -0
biopipen/ns/protein.py +84 -0
biopipen/ns/regulatory.py +72 -0
biopipen/ns/vcf.py +7 -3
biopipen/reports/protein/ProdigySummary.svelte +16 -0
biopipen/scripts/bam/BamMerge.py +10 -14
biopipen/scripts/bam/BamSampling.py +90 -0
biopipen/scripts/protein/Prodigy.py +119 -0
biopipen/scripts/protein/ProdigySummary.R +133 -0
biopipen/scripts/regulatory/MotifAffinityTest.R +5 -143
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +31 -37
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +25 -26
biopipen/scripts/regulatory/VariantMotifPlot.R +76 -0
biopipen/scripts/regulatory/motifs-common.R +322 -0
biopipen/scripts/vcf/TruvariBench.sh +14 -7
biopipen/scripts/vcf/TruvariBenchSummary.R +1 -2
{biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/METADATA +1 -1
{biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/RECORD +21 -16
{biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/entry_points.txt +1 -0
biopipen/scripts/regulatory/atSNP.R +0 -33
biopipen/scripts/regulatory/motifBreakR.R +0 -1594
{biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/WHEEL +0 -0

biopipen/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.31.4"
1	+ __version__ = "0.31.6"

biopipen/ns/bam.py CHANGED Viewed

@@ -260,3 +260,44 @@ class BamMerge(Proc):
         "sort_args": [],
     }
     script = "file://../scripts/bam/BamMerge.py"
+class BamSampling(Proc):
+    """Keeping only a fraction of read pairs from a bam file
+    Input:
+        bamfile: The bam file
+    Output:
+        outfile: The output bam file
+    Envs:
+        ncores: Number of cores to use
+        samtools: Path to samtools executable
+        tool: The tool to use, currently only "samtools" is supported
+        fraction (type=float): The fraction of reads to keep.
+            If `0 < fraction <= 1`, it's the fraction of reads to keep.
+            If `fraction > 1`, it's the number of reads to keep.
+            Note that when fraction > 1, you may not get the exact number
+            of reads specified but a close number.
+        seed: The seed for random number generator
+        index: Whether to index the output bam file
+        sort: Whether to sort the output bam file
+        sort_args: The arguments for sorting bam file using `samtools sort`.
+            These keys are not allowed: `-o`, `-@`,
+            and `--threads`, as they are managed by the script.
+    """
+    input = "bamfile:file"
+    output = "outfile:file:{{in.bamfile | stem}}.sampled{{envs.fraction}}.bam"  # the file name embeds envs.fraction
+    lang = config.lang.python
+    envs = {
+        "ncores": config.misc.ncores,
+        "samtools": config.exe.samtools,
+        "tool": "samtools",  # the script raises ValueError for any other value
+        "fraction": None,  # required: the script raises ValueError when this stays None
+        "seed": 8525,
+        "index": True,  # the script rejects index=True combined with sort=False
+        "sort": True,
+        "sort_args": [],  # must not contain -o, -@ or --threads (checked by the script)
+    }
+    script = "file://../scripts/bam/BamSampling.py"

biopipen/ns/protein.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""Protein-related processes."""
+from ..core.proc import Proc
+from ..core.config import config
+class Prodigy(Proc):
+    """Prediction of binding affinity of protein-protein complexes based on
+    intermolecular contacts using Prodigy.
+    See <https://rascar.science.uu.nl/prodigy/> and
+    <https://github.com/haddocking/prodigy>.
+    `prodigy-prot` must be installed under the given python of `proc.lang`.
+    Input:
+        infile: The structure file in PDB or mmCIF format.
+    Output:
+        outfile: The output file generated by Prodigy.
+        outdir: The output directory containing all output files.
+    Envs:
+        distance_cutoff (type=float): The distance cutoff to calculate intermolecular
+            contacts.
+        acc_threshold (type=float): The accessibility threshold for BSA analysis.
+        temperature (type=float): The temperature (C) for Kd prediction.
+        contact_list (flag): Whether to generate contact list.
+        pymol_selection (flag): Whether output a script to highlight the interface
+            residues in PyMOL.
+        selection (list): The selection of the chains to analyze.
+            `['A', 'B']` will analyze chains A and B.
+            `['A,B', 'C']` will analyze chain A and C; and B and C.
+            `['A', 'B', 'C']` will analyze all combinations of A, B, and C.
+        outtype (choice): Set the format of the output file (`out.outfile`).
+            All three files will be generated. This option only determines which
+            is assigned to `out.outfile`.
+            - raw: The raw output file from prodigy.
+            - json: The output file in JSON format.
+            - tsv: The output file in CSV format.
+    """
+    input = "infile:file"
+    output = [
+        "outfile:file:{{in.infile | stem}}_prodigy/"
+        "{{in.infile | stem}}.{{envs.outtype if envs.outtype != 'raw' else 'out'}}",
+        "outdir:dir:{{in.infile | stem}}_prodigy",
+    ]
+    lang = config.lang.python
+    envs = {
+        "distance_cutoff": 5.5,
+        "acc_threshold": 0.05,
+        "temperature": 25.0,
+        "contact_list": True,
+        "pymol_selection": True,
+        "selection": None,
+        "outtype": "json",
+    }
+    script = "file://../scripts/protein/Prodigy.py"
+class ProdigySummary(Proc):
+    """Summary of the output from `Prodigy`.
+    Input:
+        infiles: The output json file generated by `Prodigy`.
+    Output:
+        outdir: The directory of summary files generated by `ProdigySummary`.
+    Envs:
+        group (type=auto): The group of the samples for boxplots.
+            If `None`, don't do boxplots.
+            It can be a dict of group names and sample names, e.g.
+            `{"group1": ["sample1", "sample2"], "group2": ["sample3"]}`
+            or a file containing the group information, with the first column
+            being the sample names and the second column being the group names.
+            The file should be tab-delimited with no header.
+    """
+    input = "infiles:files"
+    input_data = lambda ch: [[f"{odir}/_prodigy.tsv" for odir in ch.outdir]]  # NOTE(review): collects the `_prodigy.tsv` files although the docstring says JSON - confirm which format ProdigySummary.R reads
+    output = "outdir:dir:prodigy_summary"
+    lang = config.lang.rscript
+    envs = {"group": None}  # None disables the boxplots (see the docstring)
+    script = "file://../scripts/protein/ProdigySummary.R"
+    plugin_opts = {"report": "file://../reports/protein/ProdigySummary.svelte"}

biopipen/ns/regulatory.py CHANGED Viewed

@@ -212,3 +212,75 @@ class MotifAffinityTest(Proc):
         "atsnp_args": {"padj_cutoff": True, "padj": "BH", "p": "pval_diff"},
     }
     script = "file://../scripts/regulatory/MotifAffinityTest.R"
+class VariantMotifPlot(Proc):
+    """A plot with a genomic region surrounding a genomic variant, and
+    potentially disrupted motifs.
+    Currently only SNVs are supported.
+    Input:
+        infile: File containing the variants and motifs.
+            It is a TAB-delimited file with the following columns:
+            - chrom: The chromosome of the SNV. Alias: chr, seqnames.
+            - start: The start position of the SNV, no matter 0- or 1-based.
+            - end: The end position of the SNV, which will be used as the position of the SNV.
+            - strand: Indicating the direction of the surrounding sequence matching the motif.
+            - SNP_id: The name of the SNV.
+            - REF: The reference allele of the SNV.
+            - ALT: The alternative allele of the SNV.
+            - providerId: The motif id. It can be specified by `envs.motif_col`.
+            - providerName: The name of the motif provider. Optional.
+            - Regulator: The regulator name. Optional, can be specified by `envs.regulator_col`.
+            - motifPos: The position of the motif, relative to the position of the SNV.
+                For example, '-8, 4' means the motif is 8 bp upstream and 4 bp downstream of the SNV.
+    Envs:
+        genome: The genome assembly.
+            Used to fetch the sequences around the variants by package, for example, `BSgenome.Hsapiens.UCSC.hg19` is required if
+            `hg19`. If it is an organism other than human, please specify the full name of the package, for example, `BSgenome.Mmusculus.UCSC.mm10`.
+        motifdb: The path to the motif database. This is required.
+            It should be in the format of MEME motif database.
+            Databases can be downloaded here: <https://meme-suite.org/meme/doc/download.html>.
+            See also introduction to the databases: <https://meme-suite.org/meme/db/motifs>.
+            [universalmotif](https://github.com/bjmt/universalmotif) is required to read the motif database.
+        motif_col: The column name in the motif file containing the motif names.
+            If this is not provided, `envs.regulator_col` and `envs.regmotifs` are required,
+            which are used to infer the motif names from the regulator names.
+        regulator_col: The column name in the motif file containing the regulator names.
+            Both `motif_col` and `regulator_col` should be the direct column names or
+            the index (1-based) of the columns.
+            If no `regulator_col` is provided, no regulator information is written in
+            the output. Otherwise, the regulator information is written in the output in
+            the `Regulator` column.
+        regmotifs: The path to the regulator-motif mapping file.
+            It must have header and the columns `Motif` or `Model` for motif names and
+            `TF`, `Regulator` or `Transcription factor`  for regulator names.
+        notfound (choice): What to do if a motif is not found in the database,
+            or a regulator is not found in the regulator-motif mapping (envs.regmotifs)
+            file.
+            - error: Report error and stop the process.
+            - ignore: Ignore the motif and continue.
+        devpars (ns): The default device parameters for the plot.
+            - width (type=int): The width of the plot.
+            - height (type=int): The height of the plot.
+            - res (type=int): The resolution of the plot.
+        plot_vars (type=auto): The variants (SNP_id) to plot.
+            A list of variant names to plot or a string with the variant names separated by comma.
+            When not specified, all variants are plotted.
+    """  # noqa: E501
+    input = "infile:file"
+    output = "outdir:dir:{{in.infile | stem}}.vmplots"
+    lang = config.lang.rscript
+    envs = {
+        "genome": config.ref.genome,
+        "motifdb": config.ref.tf_motifdb,
+        "motif_col": "providerId",
+        "regulator_col": None,
+        "regmotifs": config.ref.tf_motifs,
+        "notfound": "error",
+        "devpars": {"width": 800, "height": None, "res": 100},
+        "plot_vars": None,
+    }
+    script = "file://../scripts/regulatory/VariantMotifPlot.R"

biopipen/ns/vcf.py CHANGED Viewed

@@ -335,6 +335,8 @@ class TruvariBench(Proc):
     """Run `truvari bench` to compare a VCF with CNV calls and
     base CNV standards
+    Requires truvari v4+
     See https://github.com/ACEnglish/truvari/wiki/bench
     Input:
@@ -358,7 +360,7 @@ class TruvariBench(Proc):
         "truvari": config.exe.truvari,
         "ref": config.ref.reffa,
         "refdist": 500,
-        "pctsim": 0.7,
+        "pctseq": 0.7,
         "pctsize": 0.7,
         "pctovl": 0.0,
         "typeignore": False,
@@ -402,7 +404,7 @@ class TruvariBenchSummary(Proc):
     output = "outdir:dir:truvari_bench.summary"
     lang = config.lang.rscript
     envs = {
-        "plots": ["call cnt", "base cnt", "precision", "recall", "f1"],
+        "plots": ["comp cnt", "base cnt", "precision", "recall", "f1"],
         "devpars": None,
     }
     script = "file://../scripts/vcf/TruvariBenchSummary.R"
@@ -414,6 +416,8 @@ class TruvariConsistency(Proc):
     See https://github.com/ACEnglish/truvari/wiki/consistency
+    Requires truvari v4+
     Input:
         vcfs: The vcf files with CNV calls
@@ -463,7 +467,7 @@ class BcftoolsAnnotate(Proc):
         columns (auto): Comma-separated or list of columns or tags to carry over from
             the annotation file. Overrides `-c, --columns`
         remove (auto): Remove the specified columns from the input file
-        header (type=list): Headers to be added
+        header (list): Headers to be added
         gz (flag): Whether to gzip the output file
         index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
         <more>: Other arguments for `bcftools annotate`

biopipen/reports/protein/ProdigySummary.svelte ADDED Viewed

@@ -0,0 +1,16 @@
+{% from "utils/misc.liq" import report_jobs -%}
+<script>
+    import { Image, DataTable, Descr } from "$libs";
+</script>
+{%- macro report_job(job, h=1) -%}
+    {{ job | render_job: h=h }}
+{%- endmacro -%}
+{%- macro head_job(job) -%}
+    <h1>{{job.out.outdir | stem | escape}}</h1>
+{%- endmacro -%}
+{{ report_jobs(jobs, head_job, report_job) }}

biopipen/scripts/bam/BamMerge.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from pathlib import Path
-from biopipen.utils.misc import run_command
+from biopipen.utils.misc import run_command, logger
-bamfiles = {{in.bamfiles | repr}}  # pyright: ignore
+bamfiles = {{in.bamfiles | repr}}  # pyright: ignore # noqa
 outfile = Path({{out.outfile | repr}})  # pyright: ignore
 ncores = {{envs.ncores | int}}  # pyright: ignore
 tool = {{envs.tool | quote}}  # pyright: ignore
@@ -18,7 +18,7 @@ if should_index and not should_sort:
 def use_samtools():
     """Use samtools to merge bam files"""
-    print("Using samtools")
+    logger.info("Using samtools ...")
     ofile = (
         outfile
         if not should_sort
@@ -43,11 +43,11 @@ def use_samtools():
         *merge_args,
         *bamfiles,
     ]
-    print("- Merging")
+    logger.info("- Merging the bam files ...")
     run_command(cmd)
     if should_sort:
-        print("- Sorting")
+        logger.info("- Sorting the merged bam file ...")
         for key in ["-o", "-@", "--threads"]:
             if key in sort_args:
                 raise ValueError(
@@ -67,16 +67,14 @@ def use_samtools():
         run_command(cmd)
     if should_index:
-        print("- Indexing")
+        logger.info("- Indexing the output bam file ...")
         cmd = [samtools, "index", "-@", ncores, outfile]
         run_command(cmd)
-    print("Done")
 def use_sambamba():
     """Use sambamba to merge bam files"""
-    print("Using sambamba")
+    logger.info("Using sambamba ...")
     ofile = (
         outfile
         if not should_sort
@@ -90,11 +88,11 @@ def use_sambamba():
             )
     cmd = [sambamba, "merge", "-t", ncores, *merge_args, ofile, *bamfiles]
-    print("- Merging")
+    logger.info("- Merging the bam files ...")
     run_command(cmd)
     if should_sort:
-        print("- Sorting")
+        logger.info("- Sorting the merged bam file ...")
         for key in ["-t", "--nthreads", "-o", "--out"]:
             if key in sort_args:
                 raise ValueError(
@@ -115,12 +113,10 @@ def use_sambamba():
         run_command(cmd)
     if should_index:
-        print("- Indexing")
+        logger.info("- Indexing the output bam file ...")
         cmd = [sambamba, "index", "-t", ncores, outfile]
         run_command(cmd)
-    print("Done")
 if __name__ == "__main__":
     if tool == "samtools":

biopipen/scripts/bam/BamSampling.py ADDED Viewed

@@ -0,0 +1,90 @@
+from pathlib import Path
+from biopipen.utils.misc import run_command, logger
+# using:
+# samtools view --subsample 0.1 --subsample-seed 1234 --threads 4 -b -o out.bam in.bam
+bamfile = {{ in.bamfile | repr }} # pyright: ignore # noqa
+outfile = Path({{ out.outfile | repr }}) # pyright: ignore
+ncores = {{ envs.ncores | int }} # pyright: ignore
+samtools = {{ envs.samtools | repr }} # pyright: ignore
+tool = {{ envs.tool | repr }} # pyright: ignore
+fraction = {{ envs.fraction | repr }} # pyright: ignore
+seed = {{ envs.seed | int }} # pyright: ignore
+should_index = {{ envs.index | repr }} # pyright: ignore
+should_sort = {{ envs.sort | repr }} # pyright: ignore
+sort_args = {{ envs.sort_args | repr }} # pyright: ignore
+if should_index and not should_sort:
+    raise ValueError("Indexing requires sorting")
+if fraction is None:
+    raise ValueError("'envs.fraction' must be provided.")
+if tool != "samtools":
+    raise ValueError(
+        f"Tool {tool} is not supported. "
+        "Currently only samtools is supported."
+    )
+if fraction > 1:
+    # calculate the fraction based on the number of reads
+    logger.info("Converting fraction > 1 to a fraction of reads.")
+    cmd = [
+        samtools,
+        "view",
+        "--threads",
+        ncores,
+        "-c",
+        bamfile
+    ]
+    nreads = run_command(cmd, stdout="return").strip()
+    fraction = fraction / float(int(nreads))
+ofile = (
+    outfile
+    if not should_sort
+    else outfile.with_stem(f"{outfile.stem}.unsorted")
+)
+cmd = [
+    samtools,
+    "view",
+    "--subsample",
+    fraction,
+    "--subsample-seed",
+    seed,
+    "--threads",
+    ncores,
+    "-b",
+    "-o",
+    ofile,
+    bamfile
+]
+run_command(cmd, fg=True)
+if should_sort:
+    logger.info("Sorting the output bam file.")
+    for key in ["-o", "-@", "--threads"]:
+        if key in sort_args:
+            raise ValueError(
+                f"envs.sort_args cannot contain {key}, "
+                "which is managed by the script"
+            )
+    cmd = [
+        samtools,
+        "sort",
+        "-@",
+        ncores,
+        *sort_args,
+        "-o",
+        outfile,
+        ofile
+    ]
+    run_command(cmd, fg=True)
+if should_index:
+    logger.info("Indexing the output bam file.")
+    cmd = [samtools, "index", "-@", ncores, outfile]
+    run_command(cmd, fg=True)

biopipen/scripts/protein/Prodigy.py ADDED Viewed

@@ -0,0 +1,119 @@
+import json
+import logging
+import sys
+from pathlib import Path
+from prodigy_prot.predict_IC import (
+    Prodigy,
+    check_path,
+    parse_structure,
+)
+infile = {{in.infile | repr}}  # pyright: ignore # noqa
+outfile = {{out.outfile | repr}}  # pyright: ignore
+outdir = {{out.outdir | repr}}  # pyright: ignore
+distance_cutoff = {{envs.distance_cutoff | float}}  # pyright: ignore
+acc_threshold = {{envs.acc_threshold | float}}  # pyright: ignore
+temperature = {{envs.temperature | float}}  # pyright: ignore
+contact_list = {{envs.contact_list | repr}}  # pyright: ignore
+pymol_selection = {{envs.pymol_selection | repr}}  # pyright: ignore
+selection = {{envs.selection | repr}}  # pyright: ignore
+outtype = {{envs.outtype | repr}}  # pyright: ignore
+raw_outfile = Path(outdir) / "_prodigy_raw.txt"
+json_outfile = Path(outdir) / "_prodigy.json"
+tsv_outfile = Path(outdir) / "_prodigy.tsv"
+# Log plain messages to stdout; the raw report is requested in raw_outfile via print_prediction() below
+logging.basicConfig(level=logging.INFO, stream=sys.stdout, format="%(message)s")
+logger = logging.getLogger("Prodigy")
+if isinstance(selection, str):  # accept a single selection string as well as a list
+    selection = [selection]
+struct_path = check_path(infile)
+# Parse the structure file
+structure, n_chains, n_res = parse_structure(struct_path)
+logger.info(
+    "[+] Parsed structure file {0} ({1} chains, {2} residues)".format(
+        structure.id, n_chains, n_res
+    )
+)
+prodigy = Prodigy(structure, selection, temperature)
+prodigy.predict(distance_cutoff=distance_cutoff, acc_threshold=acc_threshold)
+prodigy.print_prediction(outfile=raw_outfile, quiet=False)
+# Print out interaction network
+if contact_list:
+    prodigy.print_contacts(f"{outdir}/prodigy.ic")
+# Print out the PyMOL script highlighting the interface residues
+if pymol_selection:
+    prodigy.print_pymol_script(f"{outdir}/prodigy.pml")
+# [+] Reading structure file: <path/to/structure.cif>
+# [+] Parsed structure file <structure> (4 chains, 411 residues)
+# [+] No. of intermolecular contacts: 191
+# [+] No. of charged-charged contacts: 17
+# [+] No. of charged-polar contacts: 18
+# [+] No. of charged-apolar contacts: 60
+# [+] No. of polar-polar contacts: 5
+# [+] No. of apolar-polar contacts: 41
+# [+] No. of apolar-apolar contacts: 50
+# [+] Percentage of apolar NIS residues: 33.90
+# [+] Percentage of charged NIS residues: 30.48
+# [++] Predicted binding affinity (kcal.mol-1):    -21.3
+# [++] Predicted dissociation constant (M) at 25.0˚C:  2.3e-16
+output = {}  # parsed metrics, keyed by the short names used in the JSON/TSV files
+with open(raw_outfile, "r") as f:
+    for line in f:
+        if line.startswith("[+"):  # only "[+]" / "[++]" lines are parsed (format sampled above)
+            line = line.lstrip("[").lstrip("+").lstrip("]").lstrip()  # drop the leading "[+]" / "[++]" tag
+            if line.startswith("Reading structure file"):
+                continue
+            if line.startswith("Parsed structure file"):
+                continue
+            key, value = line.split(":", 1)  # split on the first colon only
+            key = key.strip()
+            value = value.strip()
+            if key == "No. of intermolecular contacts":
+                output["nIC"] = int(value)
+            elif key == "No. of charged-charged contacts":
+                output["nCCC"] = int(value)
+            elif key == "No. of charged-polar contacts":
+                output["nCPC"] = int(value)
+            elif key == "No. of charged-apolar contacts":
+                output["nCAPC"] = int(value)
+            elif key == "No. of polar-polar contacts":
+                output["nPPC"] = int(value)
+            elif key == "No. of apolar-polar contacts":
+                output["nAPPC"] = int(value)
+            elif key == "No. of apolar-apolar contacts":
+                output["nAPAPC"] = int(value)
+            elif key.startswith("Percentage of apolar NIS residues"):
+                output["pANISR"] = float(value)
+            elif key.startswith("Percentage of charged NIS residues"):
+                output["pCNISR"] = float(value)
+            elif key.startswith("Predicted binding affinity"):  # prefix match: the label carries a unit suffix
+                output["BindingAffinity"] = float(value)
+            elif key.startswith("Predicted dissociation constant"):  # prefix match: the label carries the temperature
+                output["DissociationConstant"] = float(value)
+with open(json_outfile, "w") as f:
+    json.dump(output, f, indent=2)
+with open(tsv_outfile, "w") as f:  # one header row and one value row, tab-separated
+    f.write("\t".join(output.keys()) + "\n")
+    f.write("\t".join(map(str, output.values())) + "\n")
+if outtype == "json":
+    json_outfile.rename(outfile)  # move the selected file to out.outfile ...
+    json_outfile.symlink_to(outfile)  # ... and leave a symlink under the fixed name in outdir
+elif outtype == "tsv":
+    tsv_outfile.rename(outfile)
+    tsv_outfile.symlink_to(outfile)
+else:  # any other value (documented choice: "raw") selects the raw report
+    raw_outfile.rename(outfile)
+    raw_outfile.symlink_to(outfile)

biopipen 0.31.4__py3-none-any.whl → 0.31.6__py3-none-any.whl

Potentially problematic release.

biopipen 0.31.4__py3-none-any.whl → 0.31.6__py3-none-any.whl