PyPI - biopipen - Versions diffs - 0.22.0__tar.gz → 0.22.2__tar.gz - Mend

biopipen 0.22.0__tar.gz → 0.22.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (225) hide show

{biopipen-0.22.0 → biopipen-0.22.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: biopipen
-Version: 0.22.0
+Version: 0.22.2
 Summary: Bioinformatics processes/pipelines that can be run from `pipen run`
 License: MIT
 Author: pwwang

biopipen-0.22.2/biopipen/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.22.2"

{biopipen-0.22.0 → biopipen-0.22.2}/biopipen/core/config.toml RENAMED Viewed

@@ -4,6 +4,8 @@
 bedtools = "bedtools"
 # bcftools to handle bcf/vcf files
 bcftools = "bcftools"
+# cellranger
+cellranger = "cellranger"
 # Control-FREEC to call cnvs
 freec = "freec"
 # liftover coordinates across genomes
@@ -59,6 +61,10 @@ liftover_chain = ""
 # tmpdir = ""
 [ref]
+# The reference for cellranger gex
+ref_cellranger_gex = ""
+# The reference for cellranger vdj
+ref_cellranger_vdj = ""
 # The reference genome
 reffa = ""
 # The directory with reference for each chromosome

{biopipen-0.22.0 → biopipen-0.22.2}/biopipen/core/filters.py RENAMED Viewed

@@ -15,6 +15,7 @@ filtermanager = FilterManager()
 @filtermanager.register
 def dict_to_cli_args(
     dic: Mapping[str, Any],
+    exclude: List[str] = None,
     prefix: str | None = None,
     sep: str | None = " ",
     dup_key: bool = True,
@@ -27,6 +28,7 @@ def dict_to_cli_args(
     Args:
         dic: The dict to convert
+        exclude: The keys to exclude
         prefix: The prefix of the keys after conversion
             Defaults to `None`, meaning `-` for short keys and `--` for long keys
         sep: The separator between key and value
@@ -37,6 +39,13 @@ def dict_to_cli_args(
             If `sep` is `None` or `=`, this must be True, otherwise an error
             will be raised
         join: Whether to join the arguments into a single string
+        start_key: The key to start the arguments
+            This is useful when you want to put some arguments at the beginning
+            of the command line
+        end_key: The key to end the arguments
+            This is useful when you want to put some arguments at the end
+            of the command line
+        dashify: Whether to replace `_` with `-` in the keys
     Returns:
         The converted string or list of strings
@@ -44,6 +53,9 @@ def dict_to_cli_args(
     if sep in [None, "="] and not dup_key:
         raise ValueError("`dup_key` must be True when sep is `None` or `=`")
+    if exclude:
+        dic = {k: v for k, v in dic.items() if k not in exclude}
     starts = []
     ends = []
     out = []

biopipen-0.22.2/biopipen/ns/cellranger.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""Cellranger pipeline module for BioPipen"""
+from ..core.proc import Proc
+from ..core.config import config
+class CellRangerCount(Proc):
+    """Run cellranger count
+    to count gene expression and/or feature barcode reads
+    Input:
+        fastqs: The input fastq files
+            Either a list of fastq files or a directory containing fastq files
+            If a directory is provided, it should be passed as a list with one
+            element.
+    Output:
+        outdir: The output directory
+    Envs:
+        ncores: Number of cores to use
+        cellranger: Path to cellranger
+        ref: Path of folder containing 10x-compatible transcriptome reference
+        tmpdir: Path to temporary directory, used to save the soft-linked fastq files
+            to pass to cellranger
+        include_introns: Set to false to exclude intronic reads in count.
+        <more>: Other environment variables required by `cellranger count`
+            See `cellranger count --help` for more details or
+            https://www.10xgenomics.com/support/software/cell-ranger/advanced/cr-command-line-arguments#count
+    """  # noqa: E501
+    input = "fastqs:files"
+    output = """outdir:dir:
+        {%- set fastqs = in.fastqs -%}
+        {%- if len(fastqs) == 1 and isdir(fastqs[0]) -%}
+            {%- set fastqs = fastqs[0] | glob: "*.fastq.gz" -%}
+        {%- endif -%}
+        {%- set sample = commonprefix(*fastqs) |
+            regex_replace: "_L\\d+_$", "" |
+            regex_replace: "_S\\d+$", "" -%}
+        {{- sample -}}
+    """
+    lang = config.lang.python
+    envs = {
+        "ncores": config.misc.ncores,
+        "cellranger": config.exe.cellranger,
+        "ref": config.ref.ref_cellranger_gex,
+        "tmpdir": config.path.tmpdir,
+        "include_introns": "true",
+    }
+    script = "file://../scripts/cellranger/CellRangerCount.py"
+    plugin_opts = {
+        "report": "file://../reports/cellranger/CellRangerCount.svelte",
+    }
+class CellRangerVdj(Proc):
+    """Run cellranger vdj
+    to perform sequence assembly and paired clonotype calling
+    Input:
+        fastqs: The input fastq files
+            Either a list of fastq files or a directory containing fastq files
+            If a directory is provided, it should be passed as a list with one
+            element.
+    Output:
+        outdir: The output directory
+    Envs:
+        ncores: Number of cores to use
+        cellranger: Path to cellranger
+        ref: Path of folder containing 10x-compatible V(D)J reference
+        tmpdir: Path to temporary directory, used to save the soft-linked fastq files
+            to pass to cellranger
+        <more>: Other environment variables required by `cellranger vdj`
+            See `cellranger vdj --help` for more details or
+            https://www.10xgenomics.com/support/software/cell-ranger/advanced/cr-command-line-arguments#vdj
+    """  # noqa: E501
+    input = "fastqs:files"
+    output = """outdir:dir:
+        {%- set fastqs = in.fastqs -%}
+        {%- if len(fastqs) == 1 and isdir(fastqs[0]) -%}
+            {%- set fastqs = fastqs[0] | glob: "*.fastq.gz" -%}
+        {%- endif -%}
+        {%- set sample = commonprefix(*fastqs) |
+            regex_replace: "_L\\d+_$", "" |
+            regex_replace: "_S\\d+$", "" -%}
+        {{- sample -}}
+    """
+    lang = config.lang.python
+    envs = {
+        "ncores": config.misc.ncores,
+        "cellranger": config.exe.cellranger,
+        "ref": config.ref.ref_cellranger_vdj,
+        "tmpdir": config.path.tmpdir,
+    }
+    script = "file://../scripts/cellranger/CellRangerVdj.py"
+    plugin_opts = {
+        "report": "file://../reports/cellranger/CellRangerVdj.svelte",
+    }

{biopipen-0.22.0 → biopipen-0.22.2}/biopipen/ns/scrna.py RENAMED Viewed

@@ -1422,6 +1422,8 @@ class CellTypeAnnotation(Proc):
             If the length of `cell_types` is shorter than the number of
             clusters, the remaining clusters will be kept as the original cell
             types.
+            You can also use `NA` to remove the clusters from downstream analysis. This
+            only works when `envs.newcol` is not specified.
             /// Note
             If `tool` is `direct` and `cell_types` is not specified or an empty list,

{biopipen-0.22.0 → biopipen-0.22.2}/biopipen/ns/tcr.py RENAMED Viewed

@@ -40,11 +40,13 @@ class ImmunarchLoading(Proc):
     Output:
         rdsfile: The RDS file with the data and metadata
-        metatxt: The meta data of the cells, used to attach to the Seurat object
+        metatxt: The meta data at cell level, which can be used to attach to the Seurat object
     Envs:
         prefix: The prefix to the barcodes. You can use placeholder like `{Sample}_`
-            to use the meta data from the `immunarch` object.
+            to use the meta data from the `immunarch` object. The prefixed barcodes will
+            be saved in `out.metatxt`. The `immunarch` object keeps the original barcodes, but
+            the prefix is saved at `immdata$prefix`.
             /// Note
             This option is useful because the barcodes for the cells from scRNA-seq
@@ -65,10 +67,16 @@ class ImmunarchLoading(Proc):
             paired chain data. For `single`, only TRB chain will be kept
             at `immdata$data`, information for other chains will be
             saved at `immdata$tra` and `immdata$multi`.
-        metacols (list): The columns to be exported to the text file.
+        extracols (list): The extra columns to be exported to the text file.
             You can refer to the
             [immunarch documentation](https://immunarch.com/articles/v2_data.html#immunarch-data-format)
-            for the full list of the columns.
+            to get a sense for the full list of the columns.
+            The columns may vary depending on the data source.
+            The columns from `immdata$meta` and some core columns, including
+            `Barcode`, `CDR3.aa`, `Clones`, `Proportion`, `V.name`, `J.name`, and
+            `D.name` will be exported by default. You can use this option to
+            specify the extra columns to be exported.
     """  # noqa: E501
     input = "metafile:file"
     output = [
@@ -80,7 +88,7 @@ class ImmunarchLoading(Proc):
         "tmpdir": config.path.tmpdir,
         "prefix": "{Sample}_",
         "mode": "single",
-        "metacols": ["Clones", "Proportion", "CDR3.aa"],
+        "extracols": [],
     }
     script = "file://../scripts/tcr/ImmunarchLoading.R"
@@ -322,6 +330,7 @@ class Immunarch(Proc):
         prefix: The prefix to the barcodes. You can use placeholder like `{Sample}_`
             The prefixed barcodes will be used to match the barcodes in `in.metafile`.
             Not used if `in.metafile` is not specified.
+            If `None` (default), `immdata$prefix` will be used.
         volumes (ns): Explore clonotype volume (sizes).
             - by: Groupings when visualize clonotype volumes, passed to the `.by` argument of `vis(imm_vol, .by = <values>)`.
                 Multiple columns should be separated by `,`.
@@ -682,7 +691,7 @@ class Immunarch(Proc):
     lang = config.lang.rscript
     envs = {
         "mutaters": {},
-        "prefix": "{Sample}_",
+        "prefix": None,
         # basic statistics
         "volumes": {
             "by": None,
@@ -1179,6 +1188,10 @@ class TCRClustering(Proc):
             For GIANA, using TRBV mutations is not supported
             - GIANA: by Li lab at UT Southwestern Medical Center
             - ClusTCR: by Sebastiaan Valkiers, etc
+        prefix: The prefix to the barcodes. You can use placeholder like `{Sample}_`
+            The prefixed barcodes will be used to match the barcodes in `in.metafile`.
+            Not used if `in.metafile` is not specified.
+            If `None` (default), `immdata$prefix` will be used.
         python: The path of python with `GIANA`'s dependencies installed
             or with `clusTCR` installed. Depending on the `tool` you choose.
         args (type=json): The arguments for the clustering tool
@@ -1202,6 +1215,7 @@ class TCRClustering(Proc):
     lang = config.lang.rscript
     envs = {
         "tool": "GIANA",  # or ClusTCR
+        "prefix": None,
         "on_multi": False,
         "python": config.lang.python,
         "args": {},
@@ -1507,7 +1521,8 @@ class TESSA(Proc):
             [link](https://www.nature.com/articles/s42256-021-00383-2)
     Input:
-        immdata: The data loaded by `immunarch::repLoad()`, saved in RDS format
+        immdata: The immunarch object in RDS file or text file of TCR data loaded by
+            [`ImmunarchLoading`](!!#biopipennstcrimmunarchloading)
         srtobj: The `Seurat` object, saved in RDS format, with dimension
             reduction performed if you want to use them to represent the
             transcriptome of T cells.
@@ -1522,8 +1537,13 @@ class TESSA(Proc):
     Envs:
         python: The path of python with `TESSA`'s dependencies installed
-        prefix: The prefix to the barcodes of TCR data. You can use placeholder
-            like `{Sample}_` to use the meta data from the immunarch object.
+        prefix: The prefix of the cell barcodes in the `Seurat` object.
+            One could use a fixed prefix, or a placeholder with the column
+            name in meta data. For example, `"{Sample}_"` will replace the
+            placeholder with the value of the column `Sample` in meta data.
+            If `in.immdata` is text file, the prefix will be ignored and the
+            barcode should be already prefixed.
+            If `None` and `in.immdata` is RDS file, `immdata$prefix` will be used.
         within_sample (flag): Whether the TCR networks are constructed only
             within TCRs from the same sample/patient (True) or with all the
             TCRs in the meta data matrix (False).
@@ -1548,7 +1568,7 @@ class TESSA(Proc):
     lang = config.lang.rscript
     envs = {
         "python": config.lang.python,
-        "prefix": "{Sample}_",
+        "prefix": None,
         "assay": "RNA",
         "within_sample": False,
         "predefined_b": False,

biopipen-0.22.2/biopipen/reports/cellranger/CellRangerCount.svelte ADDED Viewed

@@ -0,0 +1,16 @@
+{% from "utils/misc.liq" import report_jobs, table_of_images -%}
+{%- macro report_job(job, h=1) -%}
+    <h{{h}}>{{job.out.outdir | basename | escape}}</h{{h}}>
+    <iframe
+        src="{{job.out.outdir}}/outs/web_summary.html"
+        width="100%"
+        frameborder="0"
+        style="min-height: 80vh"></iframe>
+{%- endmacro -%}
+{%- macro head_job(job) -%}
+    <h1>{{job.out.outdir | basename | escape}}</h1>
+{%- endmacro -%}
+{{ report_jobs(jobs, head_job, report_job) }}

biopipen-0.22.2/biopipen/reports/cellranger/CellRangerVdj.svelte ADDED Viewed

@@ -0,0 +1,16 @@
+{% from "utils/misc.liq" import report_jobs, table_of_images -%}
+{%- macro report_job(job, h=1) -%}
+    <h{{h}}>{{job.out.outdir | basename | escape}}</h{{h}}>
+    <iframe
+        src="{{job.out.outdir}}/outs/web_summary.html"
+        width="100%"
+        frameborder="0"
+        style="min-height: 80vh"></iframe>
+{%- endmacro -%}
+{%- macro head_job(job) -%}
+    <h1>{{job.out.outdir | basename | escape}}</h1>
+{%- endmacro -%}
+{{ report_jobs(jobs, head_job, report_job) }}

biopipen-0.22.2/biopipen/scripts/cellranger/CellRangerCount.py ADDED Viewed

@@ -0,0 +1,79 @@
+import uuid
+import re
+from pathlib import Path
+from biopipen.utils.misc import run_command
+fastqs = {{in.fastqs | repr}}  # pyright: ignore  # noqa
+outdir = {{out.outdir | quote}}  # pyright: ignore
+cellranger = {{envs.cellranger | quote}}  # pyright: ignore
+tmpdir = Path({{envs.tmpdir | quote}})  # pyright: ignore
+ref = {{envs.ref | quote}}  # pyright: ignore
+ncores = {{envs.ncores | int}}  # pyright: ignore
+{% if "id" in envs -%}
+id = {{envs.id | quote}}  # pyright: ignore
+{%- else -%}
+id = {{out.outdir | basename | quote}}  # pyright: ignore
+{%- endif %}
+{% if "sample" in envs -%}
+sample = {{envs.sample | quote}}  # pyright: ignore
+{%- else -%}
+sample = {{out.outdir | basename | quote}}  # pyright: ignore
+{%- endif %}
+# create a temporary unique directory to store the soft-linked fastq files
+fastqdir = tmpdir / f"cellranger_count_{uuid.uuid4()}"
+fastqdir.mkdir(parents=True, exist_ok=True)
+if len(fastqs) == 1 and fastqs[0].is_dir():
+    fastqs = list(fastqs[0].glob("*.fastq.gz"))
+# soft-link the fastq files to the temporary directory
+for fastq in fastqs:
+    fastq = Path(fastq)
+    (fastqdir / fastq.name).symlink_to(fastq)
+other_args = {{envs | dict_to_cli_args: dashify=True, exclude=['cellranger', 'transcriptome', 'ref', 'tmpdir', 'id', 'sample', 'ncores']}}  # pyright: ignore
+command = [
+    cellranger,
+    "count",
+    "--id",
+    id,
+    "--sample",
+    sample,
+    "--fastqs",
+    fastqdir,
+    "--transcriptome",
+    ref,
+    "--localcores",
+    ncores,
+    "--disable-ui",
+    *other_args,
+]
+run_command(command, fg=True, cwd=str(Path(outdir).parent))
+web_summary_html = Path(outdir) / "outs" / "web_summary.html"
+if not web_summary_html.exists():
+    raise RuntimeError(
+        f"web_summary.html does not exist in {outdir}/outs. "
+        "cellranger count failed."
+    )
+# Modify web_summary.html to move javascript to a separate file
+# to avoid vscode live server breaking the page by injecting some code
+print("# Modify web_summary.html to move javascript to a separate file")
+try:
+    web_summary_js = Path(outdir) / "outs" / "web_summary.js"
+    web_summary_content = web_summary_html.read_text()
+    regex = re.compile(r"<script>(?=/\*! For license)(.+)</script>", re.DOTALL)
+    web_summary_html.write_text(regex.sub(
+        '<script src="web_summary.js"></script>',
+        web_summary_content,
+    ))
+    web_summary_js.write_text(regex.search(web_summary_content).group(1))
+except Exception as e:
+    print(f"Error modifying web_summary.html: {e}")
+    raise e

biopipen-0.22.2/biopipen/scripts/cellranger/CellRangerVdj.py ADDED Viewed

@@ -0,0 +1,79 @@
+import uuid
+import re
+from pathlib import Path
+from biopipen.utils.misc import run_command
+fastqs = {{in.fastqs | repr}}  # pyright: ignore  # noqa
+outdir = {{out.outdir | quote}}  # pyright: ignore
+cellranger = {{envs.cellranger | quote}}  # pyright: ignore
+tmpdir = Path({{envs.tmpdir | quote}})  # pyright: ignore
+ref = {{envs.ref | quote}}  # pyright: ignore
+ncores = {{envs.ncores | int}}  # pyright: ignore
+{% if "id" in envs -%}
+id = {{envs.id | quote}}  # pyright: ignore
+{%- else -%}
+id = {{out.outdir | basename | quote}}  # pyright: ignore
+{%- endif %}
+{% if "sample" in envs -%}
+sample = {{envs.sample | quote}}  # pyright: ignore
+{%- else -%}
+sample = {{out.outdir | basename | quote}}  # pyright: ignore
+{%- endif %}
+# create a temporary unique directory to store the soft-linked fastq files
+fastqdir = tmpdir / f"cellranger_count_{uuid.uuid4()}"
+fastqdir.mkdir(parents=True, exist_ok=True)
+if len(fastqs) == 1 and fastqs[0].is_dir():
+    fastqs = list(fastqs[0].glob("*.fastq.gz"))
+# soft-link the fastq files to the temporary directory
+for fastq in fastqs:
+    fastq = Path(fastq)
+    (fastqdir / fastq.name).symlink_to(fastq)
+other_args = {{envs | dict_to_cli_args: dashify=True, exclude=['cellranger', 'reference', 'ref', 'tmpdir', 'id', 'sample', 'ncores']}}  # pyright: ignore
+command = [
+    cellranger,
+    "vdj",
+    "--id",
+    id,
+    "--sample",
+    sample,
+    "--fastqs",
+    fastqdir,
+    "--reference",
+    ref,
+    "--localcores",
+    ncores,
+    "--disable-ui",
+    *other_args,
+]
+run_command(command, fg=True, cwd=str(Path(outdir).parent))
+web_summary_html = Path(outdir) / "outs" / "web_summary.html"
+if not web_summary_html.exists():
+    raise RuntimeError(
+        f"web_summary.html does not exist in {outdir}/outs. "
+        "cellranger vdj failed."
+    )
+# Modify web_summary.html to move javascript to a separate file
+# to avoid vscode live server breaking the page by injecting some code
+print("# Modify web_summary.html to move javascript to a separate file")
+try:
+    web_summary_js = Path(outdir) / "outs" / "web_summary.js"
+    web_summary_content = web_summary_html.read_text()
+    regex = re.compile(r"<script>(?=/\*! For license)(.+)</script>", re.DOTALL)
+    web_summary_html.write_text(regex.sub(
+        '<script src="web_summary.js"></script>',
+        web_summary_content,
+    ))
+    web_summary_js.write_text(regex.search(web_summary_content).group(1))
+except Exception as e:
+    print(f"Error modifying web_summary.html: {e}")
+    raise e

biopipen-0.22.2/biopipen/scripts/scrna/CellTypeAnnotation-direct.R ADDED Viewed

@@ -0,0 +1,55 @@
+source("{{biopipen_dir}}/utils/misc.R")
+library(Seurat)
+sobjfile <- {{in.sobjfile | r}}
+outfile <- {{out.outfile | r}}
+celltypes <- {{envs.cell_types | r}}
+newcol <- {{envs.newcol | r}}
+if (is.null(celltypes) || length(celltypes) == 0) {
+    log_warn("No cell types are given!")
+    # create a symbolic link to the input file
+    file.symlink(sobjfile, outfile)
+} else {
+    log_info("Loading Seurat object ...")
+    sobj <- readRDS(sobjfile)
+    idents <- as.character(unique(Idents(sobj)))
+    idents <- idents[order(as.numeric(idents))]
+    if (length(celltypes) < length(idents)) {
+        celltypes <- c(celltypes, idents[(length(celltypes) + 1):length(idents)])
+    } else if (length(celltypes) > length(idents)) {
+        celltypes <- celltypes[1:length(idents)]
+        log_warn("The length of cell types is longer than the number of clusters!")
+    }
+    for (i in seq_along(celltypes)) {
+        if (celltypes[i] == "-" || celltypes[i] == "") {
+            celltypes[i] <- idents[i]
+        }
+    }
+    names(celltypes) <- idents
+    log_info("Renaming cell types ...")
+    if (is.null(newcol)) {
+        has_na <- "NA" %in% unlist(celltypes) || anyNA(unlist(celltypes))
+        sobj$seurat_clusters_id <- Idents(sobj)
+        celltypes$object <- sobj
+        sobj <- do_call(RenameIdents, celltypes)
+        sobj$seurat_clusters <- Idents(sobj)
+        if (has_na) {
+            log_info("Filtering clusters if NA ...")
+            sobj <- subset(
+                sobj,
+                subset = seurat_clusters != "NA" & !is.na(seurat_clusters)
+            )
+        }
+    } else {
+        celltypes$object <- sobj
+        sobj <- do_call(RenameIdents, celltypes)
+        sobj[[newcol]] <- Idents(sobj)
+        Idents(sobj) <- "seurat_clusters"
+    }
+    saveRDS(sobj, outfile)
+}

{biopipen-0.22.0 → biopipen-0.22.2}/biopipen/scripts/scrna/CellsDistribution.R RENAMED Viewed

@@ -142,13 +142,8 @@ do_case <- function(name, case) {
     info <- casename_info(name, create = TRUE)
     cells_by <- trimws(strsplit(case$cells_by, ",")[[1]])
-    sec_case_names <- strsplit(name, ":")[[1]]
-    sec_dir <- file.path(outdir, sec_case_names[1])
-    casename <- paste(sec_case_names[-1], collapse = ":")
-    dir.create(sec_dir, showWarnings = FALSE, recursive = TRUE)
-    outfile <- file.path(info$sec_dir, paste0("case-", info$case_slug, ".png"))
-    txtfile <- file.path(info$sec_dir, paste0("case-", info$case_slug, ".txt"))
+    outfile <- file.path(info$sec_dir, paste0(info$case_slug, ".png"))
+    txtfile <- file.path(info$sec_dir, paste0(info$case_slug, ".txt"))
     # subset the seurat object
     meta <- srtobj@meta.data
@@ -229,14 +224,20 @@ do_case <- function(name, case) {
         meta %>% select(
             !!sym(cells_by),
             !!sym(case$group_by),
+            seurat_clusters,
             CloneSize,
             CloneGroupSize,
             CloneClusterSize,
             CloneGroupClusterSize,
+        ) %>% distinct(
+            !!sym(cells_by),
+            !!sym(case$group_by),
+            seurat_clusters,
+            .keep_all = TRUE
         ),
         txtfile,
         sep = "\t",
-        row.names = TRUE,
+        row.names = FALSE,
         col.names = TRUE,
         quote = FALSE
     )

biopipen 0.22.0__tar.gz → 0.22.2__tar.gz

Potentially problematic release.

biopipen 0.22.0__tar.gz → 0.22.2__tar.gz