biopipen 0.32.3__py3-none-any.whl → 0.33.1__py3-none-any.whl

This diff shows the changes between the two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: see the package's registry page for details.

Files changed (118)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +6 -0
  3. biopipen/core/filters.py +35 -23
  4. biopipen/core/testing.py +6 -1
  5. biopipen/ns/bam.py +39 -0
  6. biopipen/ns/cellranger.py +5 -0
  7. biopipen/ns/cellranger_pipeline.py +2 -2
  8. biopipen/ns/cnvkit_pipeline.py +4 -1
  9. biopipen/ns/delim.py +33 -27
  10. biopipen/ns/protein.py +99 -0
  11. biopipen/ns/scrna.py +428 -250
  12. biopipen/ns/snp.py +16 -3
  13. biopipen/ns/tcr.py +125 -1
  14. biopipen/ns/vcf.py +34 -0
  15. biopipen/ns/web.py +5 -1
  16. biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
  17. biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
  18. biopipen/reports/tcr/ClonalStats.svelte +15 -0
  19. biopipen/reports/utils/misc.liq +20 -7
  20. biopipen/scripts/bam/BamMerge.py +2 -2
  21. biopipen/scripts/bam/BamSampling.py +4 -4
  22. biopipen/scripts/bam/BamSort.py +141 -0
  23. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  24. biopipen/scripts/bam/BamSubsetByBed.py +3 -3
  25. biopipen/scripts/bam/CNVpytor.py +10 -10
  26. biopipen/scripts/bam/ControlFREEC.py +11 -11
  27. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  28. biopipen/scripts/bed/BedConsensus.py +5 -5
  29. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  30. biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
  31. biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
  32. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  33. biopipen/scripts/cellranger/CellRangerCount.py +20 -9
  34. biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
  35. biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
  36. biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
  37. biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
  38. biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
  39. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
  41. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  42. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  43. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
  44. biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
  45. biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
  46. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  47. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  48. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  49. biopipen/scripts/delim/SampleInfo.R +94 -148
  50. biopipen/scripts/misc/Config2File.py +2 -2
  51. biopipen/scripts/misc/Str2File.py +2 -2
  52. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  53. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  54. biopipen/scripts/protein/Prodigy.py +4 -4
  55. biopipen/scripts/protein/RMSD.py +178 -0
  56. biopipen/scripts/regulatory/MotifScan.py +8 -8
  57. biopipen/scripts/scrna/CellCellCommunication.py +59 -22
  58. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  59. biopipen/scripts/scrna/MarkersFinder.R +273 -654
  60. biopipen/scripts/scrna/RadarPlots.R +73 -53
  61. biopipen/scripts/scrna/SCP-plot.R +15202 -0
  62. biopipen/scripts/scrna/ScVelo.py +0 -0
  63. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
  64. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
  65. biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
  66. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
  67. biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
  68. biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
  69. biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
  70. biopipen/scripts/scrna/SeuratPreparing.R +138 -81
  71. biopipen/scripts/scrna/SlingShot.R +71 -0
  72. biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
  73. biopipen/scripts/snp/Plink2GTMat.py +26 -11
  74. biopipen/scripts/snp/PlinkFilter.py +7 -7
  75. biopipen/scripts/snp/PlinkFromVcf.py +8 -5
  76. biopipen/scripts/snp/PlinkSimulation.py +4 -4
  77. biopipen/scripts/snp/PlinkUpdateName.py +4 -4
  78. biopipen/scripts/stats/ChowTest.R +48 -22
  79. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  80. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  81. biopipen/scripts/tcr/ClonalStats.R +484 -0
  82. biopipen/scripts/tcr/ScRepLoading.R +127 -0
  83. biopipen/scripts/tcr/TCRDock.py +10 -6
  84. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  85. biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
  86. biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
  87. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  88. biopipen/scripts/vcf/BcftoolsSort.py +4 -4
  89. biopipen/scripts/vcf/BcftoolsView.py +5 -5
  90. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  91. biopipen/scripts/vcf/VcfAnno.py +11 -11
  92. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  93. biopipen/scripts/vcf/VcfFilter.py +5 -5
  94. biopipen/scripts/vcf/VcfFix.py +7 -7
  95. biopipen/scripts/vcf/VcfFix_utils.py +12 -3
  96. biopipen/scripts/vcf/VcfIndex.py +3 -3
  97. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  98. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  99. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  100. biopipen/scripts/vcf/bcftools_utils.py +3 -3
  101. biopipen/scripts/web/Download.py +8 -4
  102. biopipen/scripts/web/DownloadList.py +5 -5
  103. biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
  104. biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
  105. biopipen/scripts/web/gcloud_common.py +1 -1
  106. biopipen/utils/gsea.R +75 -35
  107. biopipen/utils/misc.R +205 -7
  108. biopipen/utils/misc.py +17 -8
  109. biopipen/utils/reference.py +11 -11
  110. biopipen/utils/repr.R +146 -0
  111. biopipen/utils/vcf.py +1 -1
  112. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/METADATA +8 -8
  113. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/RECORD +115 -105
  114. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/WHEEL +1 -1
  115. biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
  116. biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
  117. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
  118. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/entry_points.txt +0 -0
biopipen/ns/snp.py CHANGED
@@ -183,7 +183,7 @@ class PlinkFromVcf(Proc):
183
183
  vcf_idspace_to: convert all spaces in sample IDs to this character.
184
184
  set_missing_var_ids: update variant IDs using a template string,
185
185
  with a '@' where the chromosome code should go, and a '#' where the
186
- base-pair position belongs. You can also specify `\$r` and `\$a` for
186
+ base-pair position belongs. You can also specify `\\$r` and `\\$a` for
187
187
  the reference and alternate alleles, respectively.
188
188
  See <https://www.cog-genomics.org/plink/2.0/data#set_all_var_ids>
189
189
  max_alleles (type=int): Maximum number of alleles per variant.
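
As a rough illustration of the `--set-missing-var-ids` template described above (a hypothetical sketch, not code from the package; plink2 itself performs this substitution):

# Hypothetical sketch of the placeholder substitution plink2 performs:
# '@' -> chromosome code, '#' -> base-pair position, '$r'/'$a' -> REF/ALT alleles.
def fill_var_id(template: str, chrom: str, pos: int, ref: str, alt: str) -> str:
    return (
        template.replace("@", chrom)
        .replace("#", str(pos))
        .replace("$r", ref)
        .replace("$a", alt)
    )

print(fill_var_id("@_#_$r_$a", "1", 12345, "A", "G"))  # 1_12345_A_G
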
@@ -191,7 +191,7 @@ class PlinkFromVcf(Proc):
191
191
  Note that `_` will be replaced by `-` in the argument names.
192
192
  """ # noqa: E501
193
193
  input = "invcf:file"
194
- output = "outdir:dir:{{in.invcf | regex_replace: '\\.gz$', '' | stem}}"
194
+ output = "outdir:dir:{{in.invcf.stem | regex_replace: '\\.gz$', ''}}"
195
195
  lang = config.lang.python
196
196
  envs = {
197
197
  "plink": config.exe.plink2,
@@ -217,7 +217,14 @@ class Plink2GTMat(Proc):
217
217
 
218
218
  The allelic dosage is used as the values of genotype matrix.
219
219
  "--keep-allele-order" is used to keep the allele order consistent with the
220
- reference allele first.
220
+ reference allele first. This way, the genotype of homozygous reference alleles
221
+ will be encoded as 2, heterozygous as 1, and homozygous alternate alleles as 0.
222
+ This is the PLINK dosage encoding. If you want to use this encoding, you can
223
+ set `envs.gtcoding` to `plink`. Otherwise, the default encoding is `vcf`, which
224
+ will encode the genotype as 0, 1, and 2 for homozygous reference, heterozygous,
225
+ and homozygous alternate alleles, respectively.
226
+
227
+ Note that `envs.gtcoding = "vcf"` only works for biallelic variants for now.
221
228
 
222
229
  Input:
223
230
  indir: Input directory containing the PLINK files.
@@ -241,6 +248,11 @@ class Plink2GTMat(Proc):
241
248
  respectively.
242
249
  trans_chr: A dictionary to translate chromosome numbers to chromosome names.
243
250
  missing_id: what to use as the rs if missing.
251
+ gtcoding (choice): The genotype coding to use.
252
+ - vcf: 0/1/2 for homozygous reference, heterozygous, and homozygous
253
+ alternate alleles, respectively.
254
+ - plink: 2/1/0 for homozygous reference, heterozygous, and homozygous
255
+ alternate alleles, respectively.
244
256
  """
245
257
  input = "indir:dir"
246
258
  output = "outfile:file:{{in.indir | stem}}-gtmat.txt"
@@ -253,6 +265,7 @@ class Plink2GTMat(Proc):
253
265
  "varid": "{chr}_{pos}_{varid}_{ref}_{alt}",
254
266
  "trans_chr": {"23": "X", "24": "Y", "25": "XY", "26": "M"},
255
267
  "missing_id": "NA",
268
+ "gtcoding": "vcf",
256
269
  }
257
270
  script = "file://../scripts/snp/Plink2GTMat.py"
258
271
 
biopipen/ns/tcr.py CHANGED
@@ -39,7 +39,8 @@ class ImmunarchLoading(Proc):
39
39
  information.
40
40
 
41
41
  Output:
42
- rdsfile: The RDS file with the data and metadata
42
+ rdsfile: The RDS file with the data and metadata, which can be processed by
43
+ other `immunarch` functions.
43
44
  metatxt: The meta data at cell level, which can be used to attach to the Seurat object
44
45
 
45
46
  Envs:
@@ -1675,3 +1676,126 @@ class TCRDock(Proc):
1675
1676
  "data_dir": None,
1676
1677
  }
1677
1678
  script = "file://../scripts/tcr/TCRDock.py"
1679
+
1680
+
1681
+ class ScRepLoading(Proc):
1682
+ """Load the single cell TCR/BCR data into a `scRepertoire` compatible object
1683
+
1684
+ This process loads the single cell TCR/BCR data into a `scRepertoire`
1685
+ compatible object. Later, `scRepertoire::combineExpression` can be used to
1686
+ combine the expression data with the TCR/BCR data.
1687
+
1688
+ For the data path specified at `TCRData` in the input file, we will first find
1689
+ `filtered_contig_annotations.csv` and `filtered_contig_annotations.csv.gz` in the
1690
+ path. If neither of them exists, we will find `all_contig_annotations.csv` and
1691
+ `all_contig_annotations.csv.gz` in the path and a warning will be raised
1692
+ (You can find it at `./.pipen/<pipeline-name>/ImmunarchLoading/<job.index>/job.stderr`).
1693
+
1694
+ If none of the files exists, an error will be raised.
1695
+
1696
+ Input:
1697
+ metafile: The meta data of the samples
1698
+ A tab-delimited file
1699
+ Two columns are required:
1700
+ * `Sample` to specify the sample names.
1701
+ * `TCRData` to assign the path of the data to the samples,
1702
+ and this column will be excluded as metadata.
1703
+ Immunarch is able to fetch the sample names from the names of
1704
+ the target files. However, 10x data yields results like
1705
+ `filtered_contig_annotations.csv`, which doesn't have any name
1706
+ information.
1707
+
1708
+ Output:
1709
+ outfile: The `scRepertoire` compatible object in RDS format
1710
+
1711
+ Envs:
1712
+ combineTCR (type=json): The extra arguments for `scRepertoire::combineTCR` function.
1713
+ See also <https://www.borch.dev/uploads/screpertoire/reference/combinetcr>
1714
+ exclude (auto): The columns to exclude from the metadata to add to the object.
1715
+ A list of column names to exclude or a string with column names separated by `,`.
1716
+ By default, `TCRData` and `RNAData` will be excluded.
1717
+
1718
+ """ # noqa: E501
1719
+ input = "metafile:file"
1720
+ output = "outfile:file:{{in.metafile | stem}}.scRep.RDS"
1721
+ lang = config.lang.rscript
1722
+ envs = {"combineTCR": {"samples": True}, "exclude": ["TCRData", "RNAData"]}
1723
+ script = "file://../scripts/tcr/ScRepLoading.R"
1724
+
1725
+
1726
+ class ClonalStats(Proc):
1727
+ """Visualize the clonal information.
1728
+
1729
+ Using [`scplotter`](https://github.com/pwwang/scplotter) to visualize the clonal
1730
+ information.
1731
+
1732
+ Input:
1733
+ screpfile: The `scRepertoire` object in RDS format
1734
+
1735
+ Output:
1736
+ outdir: The output directory containing the plots
1737
+
1738
+ Envs:
1739
+ mutaters (type=json;order=-9): The mutaters passed to `dplyr::mutate()` to add new variables.
1740
+ When the object loaded from `in.screpfile` is a list, the mutaters will be applied to each element.
1741
+ The keys are the names of the new variables, and the values are the expressions.
1742
+ When it is a `Seurat` object, typically an output of `scRepertoire::combineExpression()`,
1743
+ the mutaters will be applied to the `meta.data`.
1744
+ viz_type (choice): The type of visualization to generate.
1745
+ - volume: The volume of the clones using [`ClonalVolumePlot`](https://pwwang.github.io/scplotter/reference/ClonalVolumePlot.html)
1746
+ - abundance: The abundance of the clones using [`ClonalAbundancePlot`](https://pwwang.github.io/scplotter/reference/ClonalAbundancePlot.html)
1747
+ - length: The length of the CDR3 sequences using [`ClonalLengthPlot`](https://pwwang.github.io/scplotter/reference/ClonalLengthPlot.html)
1748
+ - residency: The residency of the clones using [`ClonalResidencyPlot`](https://pwwang.github.io/scplotter/reference/ClonalResidencyPlot.html)
1749
+ - dynamics: The dynamics of the clones using [`ClonalDynamicsPlot`](https://pwwang.github.io/scplotter/reference/ClonalDynamicsPlot.html)
1750
+ - composition: The composition of the clones using [`ClonalCompositionPlot`](https://pwwang.github.io/scplotter/reference/ClonalCompositionPlot.html)
1751
+ - overlap: The overlap of the clones using [`ClonalOverlapPlot`](https://pwwang.github.io/scplotter/reference/ClonalOverlapPlot.html)
1752
+ - diversity: The diversity of the clones using [`ClonalDiversityPlot`](https://pwwang.github.io/scplotter/reference/ClonalDiversityPlot.html)
1753
+ - geneusage: The gene usage of the clones using [`ClonalGeneUsagePlot`](https://pwwang.github.io/scplotter/reference/ClonalGeneUsagePlot.html)
1754
+ - positional: The positional information of the clones using [`ClonalPositionalPlot`](https://pwwang.github.io/scplotter/reference/ClonalPositionalPlot.html)
1755
+ - kmer: The kmer information of the clones using [`ClonalKmerPlot`](https://pwwang.github.io/scplotter/reference/ClonalKmerPlot.html)
1756
+ - rarefaction: The rarefaction curve of the clones using [`ClonalRarefactionPlot`](https://pwwang.github.io/scplotter/reference/ClonalRarefactionPlot.html)
1757
+ subset: An expression to subset the data before plotting.
1758
+ Similar to `mutaters`, it will be applied to each element by `dplyr::filter()` if the object
1759
+ loaded from `in.screpfile` is a list; otherwise, it will be applied to
1760
+ `subset(sobj, subset = <expr>)` if the object is a `Seurat` object.
1761
+ devpars (ns): The parameters for the plotting device.
1762
+ - width (type=int): The width of the device
1763
+ - height (type=int): The height of the device
1764
+ - res (type=int): The resolution of the device
1765
+ more_formats (list): The extra formats to save the plots in, other than PNG.
1766
+ save_code (flag): Whether to save the code used to generate the plots
1767
+ Note that the data directly used to generate the plots will also be saved in an `rda` file.
1768
+ Be careful if the data is large as it may take a lot of disk space.
1769
+ descr: The description of the plot, used to show in the report.
1770
+ <more>: The arguments for the plot function
1771
+ See the documentation of the corresponding plot function for the details
1772
+ cases (type=json): The cases to generate the plots if we have multiple cases.
1773
+ The keys are the names of the cases, and the values are the arguments for the plot function.
1774
+ The arguments in `envs` will be used if not specified in `cases`, except for `mutaters`.
1775
+ Sections can be specified as the prefix of the case name, separated by `::`.
1776
+ For example, if you have a case named `Clonal Volume::Case1`, the plot will be put in the
1777
+ section `Clonal Volume`. By default, when there are multiple cases for the same 'viz_type', the name of the 'viz_type' will be used
1778
+ as the default section name (for example, when 'viz_type' is 'volume', the section name will be 'Clonal Volume').
1779
+ When there is only a single case, the section name will default to 'DEFAULT', which will not be shown
1780
+ in the report.
1781
+ """ # noqa: E501
1782
+ input = "screpfile:file"
1783
+ output = "outdir:dir:{{in.screpfile | stem}}.clonalstats"
1784
+ lang = config.lang.rscript
1785
+ envs = {
1786
+ "mutaters": {},
1787
+ "subset": None,
1788
+ "viz_type": None,
1789
+ "devpars": {"width": None, "height": None, "res": 100},
1790
+ "more_formats": [],
1791
+ "save_code": False,
1792
+ "descr": None,
1793
+ "cases": {
1794
+ "Clonal Volume": {"viz_type": "volume"},
1795
+ "Clonal Abundance": {"viz_type": "abundance"},
1796
+ "CDR3 Length": {"viz_type": "length"},
1797
+ "Clonal Diversity": {"viz_type": "diversity"},
1798
+ }
1799
+ }
1800
+ script = "file://../scripts/tcr/ClonalStats.R"
1801
+ plugin_opts = {"report": "file://../reports/tcr/ClonalStats.svelte"}
biopipen/ns/vcf.py CHANGED
@@ -595,6 +595,40 @@ class BcftoolsSort(Proc):
595
595
  script = "file://../scripts/vcf/BcftoolsSort.py"
596
596
 
597
597
 
598
+ class BcftoolsMerge(Proc):
599
+ """Merge multiple VCF files using `bcftools merge`.
600
+
601
+ Input:
602
+ infiles: The input VCF files
603
+
604
+ Output:
605
+ outfile: The merged VCF file.
606
+
607
+ Envs:
608
+ bcftools: Path to bcftools
609
+ tabix: Path to tabix, used to index infile/outfile
610
+ ncores (type=int): Number of cores (`--threads`) to use
611
+ gz (flag): Whether to gzip the output file
612
+ index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
613
+ <more>: Other arguments for `bcftools merge`.
614
+ See also <https://samtools.github.io/bcftools/bcftools.html#merge>
615
+ """
616
+ input = "infiles:files"
617
+ output = (
618
+ "outfile:file:{{in.infiles | first | stem | append: '_etc_merged'}}.vcf"
619
+ "{{'.gz' if envs.index or envs.gz else ''}}"
620
+ )
621
+ lang = config.lang.python
622
+ envs = {
623
+ "bcftools": config.exe.bcftools,
624
+ "tabix": config.exe.tabix,
625
+ "ncores": config.misc.ncores,
626
+ "gz": True,
627
+ "index": True,
628
+ }
629
+ script = "file://../scripts/vcf/BcftoolsMerge.py"
630
+
631
+
598
632
  class BcftoolsView(Proc):
599
633
  """View, subset and filter VCF files by position and filtering expression.
600
634
 
biopipen/ns/web.py CHANGED
@@ -32,7 +32,11 @@ class Download(Proc):
32
32
  input = "url"
33
33
  output = (
34
34
  "outfile:file:"
35
- "{{in.url | basename | replace: '%2E', '.' | slugify: separator='.'}}"
35
+ """{{in.url
36
+ | basename
37
+ | url_decode
38
+ | slugify: separator='.', lowercase=False, regex_pattern='[^-a-zA-Z0-9_]+'
39
+ }}"""
36
40
  )
37
41
  lang = config.lang.python
38
42
  envs = {
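
Assuming the `url_decode` and `slugify` filters wrap `urllib.parse.unquote` and python-slugify (an assumption, not verified against the filter definitions), the new output-name template above behaves roughly like this:

# Hedged illustration of the output-name template; the example URL basename is made up.
from urllib.parse import unquote
from slugify import slugify

basename = "GRCh38%2Eprimary_assembly.fa.gz"   # basename of in.url
name = slugify(
    unquote(basename),                         # decode %2E -> "."
    separator=".",
    lowercase=False,
    regex_pattern="[^-a-zA-Z0-9_]+",
)
print(name)  # GRCh38.primary_assembly.fa.gz (case, dots and underscores preserved)
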
biopipen/reports/scrna/SeuratClusterStats.svelte CHANGED
@@ -1,7 +1,7 @@
1
1
  {% from "utils/misc.liq" import report_jobs, table_of_images -%}
2
2
  {% from_ os import path %}
3
3
  <script>
4
- import { DataTable, Image, Descr } from "$libs";
4
+ import { DataTable, Image, Descr, Plotly } from "$libs";
5
5
  import { Tabs, Tab, TabContent } from "$ccs";
6
6
  </script>
7
7
 
biopipen/reports/scrna/SeuratMap2Ref.svelte CHANGED
@@ -6,8 +6,21 @@
6
6
  {%- macro report_job(job, h=1) -%}
7
7
 
8
8
  <h{{h}}>UMAPs</h{{h}}>
9
- {% set imgs = job.outdir | glob: "UMAPs-*.png" %}
10
- {{ table_of_images(imgs) }}
9
+ {% set imgs = [] %}
10
+ {% set caps = [] %}
11
+ {% for png in job.outdir | glob: "UMAPs-*.png" %}
12
+ {% set pdf = png | regex_replace: "\\.png$", ".pdf" %}
13
+ {% set stm = png | stem %}
14
+ {% set _ = imgs.append({"src": png, "download": pdf}) %}
15
+ {% set _ = caps.append(stm | replace: "UMAPs-", "") %}
16
+ {% endfor %}
17
+ {{ table_of_images(imgs, caps) }}
18
+
19
+ <h{{h}}>Mapping Score</h{{h}}>
20
+ <Image
21
+ src="{{job.outdir | joinpath: 'mapping_score.png'}}"
22
+ download="{{job.outdir | joinpath: 'mapping_score.pdf'}}"
23
+ />
11
24
 
12
25
  <h{{h}}>Stats</h{{h}}>
13
26
  {% for stfile in job.outdir | glob: "stats-*.txt" %}
biopipen/reports/tcr/ClonalStats.svelte ADDED
@@ -0,0 +1,15 @@
1
+ {% from "utils/misc.liq" import report_jobs, table_of_images -%}
2
+
3
+ <script>
4
+ import { Image, DataTable, Descr } from "$libs";
5
+ </script>
6
+
7
+ {%- macro report_job(job, h=1) -%}
8
+ {{ job | render_job: h=h }}
9
+ {%- endmacro -%}
10
+
11
+ {%- macro head_job(job) -%}
12
+ <h1>{{job.in.screpfile | stem | escape }}</h1>
13
+ {%- endmacro -%}
14
+
15
+ {{ report_jobs(jobs, head_job, report_job) }}
biopipen/reports/utils/misc.liq CHANGED
@@ -25,7 +25,14 @@ import { Image } from "$libs";
25
25
  {% for batch_srcs in srcs | batch: col, "" %}
26
26
  {% set outer_loop = loop %}
27
27
  <tr>
28
- {% for src in batch_srcs %}
28
+ {% for srcinfo in batch_srcs %}
29
+ {% if srcinfo | isinstance: str %}
30
+ {% set src = srcinfo %}
31
+ {% set download = None %}
32
+ {% else %}
33
+ {% set src = srcinfo['src'] %}
34
+ {% set download = srcinfo.get('download', None) %}
35
+ {% endif %}
29
36
  {% set i = col * outer_loop.index0 + loop.index0 %}
30
37
  {% if i >= len(srcs) %}
31
38
  <td style="width: {{table_width / col}}%"></td>
@@ -33,21 +40,27 @@ import { Image } from "$libs";
33
40
  <td style="width: {{table_width / col}}%; vertical-align:top;">
34
41
  {% if caps is none %}
35
42
  <div
36
- style="padding-left: 28px; font-weight: bold; padding-top: 10px; margin-bottom: -10px;">
43
+ style="padding-left: 28px; font-weight: bold; padding-top: 16px;">
37
44
  {{ src | stem }}
38
45
  </div>
39
46
  {% elif caps is false %}
40
47
  {% else %}
41
48
  <div
42
- style="padding-left: 28px; font-weight: bold; padding-top: 10px; margin-bottom: -10px;">
49
+ style="padding-left: 28px; font-weight: bold; padding-top: 16px;">
43
50
  {{ caps[i] }}
44
51
  </div>
45
52
  {% endif %}
46
- {% if src | replace: ".png", ".pdf" | exists %}
47
- <Image style="max-width: 90%" src={{src | quote}}
48
- download={{src | replace: ".png", ".pdf" | quote}} />
53
+ {% if download %}
54
+ <Image
55
+ style="max-width: 90%"
56
+ src={{src | quote}}
57
+ download={ {{download | json}} }
58
+ />
49
59
  {% else %}
50
- <Image style="max-width: 90%" src={{src | quote}} />
60
+ <Image
61
+ style="max-width: 90%"
62
+ src={{src | quote}}
63
+ />
51
64
  {% endif %}
52
65
  </td>
53
66
  {% endif %}
biopipen/scripts/bam/BamMerge.py CHANGED
@@ -1,8 +1,8 @@
1
1
  from pathlib import Path
2
2
  from biopipen.utils.misc import run_command, logger
3
3
 
4
- bamfiles = {{in.bamfiles | repr}} # pyright: ignore # noqa
5
- outfile = Path({{out.outfile | repr}}) # pyright: ignore
4
+ bamfiles = {{in.bamfiles | default: [] | each: str}} # pyright: ignore # noqa
5
+ outfile = Path({{out.outfile | quote}}) # pyright: ignore
6
6
  ncores = {{envs.ncores | int}} # pyright: ignore
7
7
  tool = {{envs.tool | quote}} # pyright: ignore
8
8
  samtools = {{envs.samtools | quote}} # pyright: ignore
biopipen/scripts/bam/BamSampling.py CHANGED
@@ -4,12 +4,12 @@ from biopipen.utils.misc import run_command, logger
4
4
  # using:
5
5
  # samtools view --subsample 0.1 --subsample-seed 1234 --threads 4 -b -o out.bam in.bam
6
6
 
7
- bamfile = {{ in.bamfile | repr }} # pyright: ignore # noqa
8
- outfile = Path({{ out.outfile | repr }}) # pyright: ignore
7
+ bamfile = {{ in.bamfile | quote }} # pyright: ignore # noqa
8
+ outfile = Path({{ out.outfile | quote }}) # pyright: ignore
9
9
  ncores = {{ envs.ncores | int }} # pyright: ignore
10
10
  samtools = {{ envs.samtools | repr }} # pyright: ignore
11
11
  tool = {{ envs.tool | repr }} # pyright: ignore
12
- fraction = {{ envs.fraction | repr }} # pyright: ignore
12
+ fraction: float = {{ envs.fraction | repr }} # pyright: ignore
13
13
  seed = {{ envs.seed | int }} # pyright: ignore
14
14
  should_index = {{ envs.index | repr }} # pyright: ignore
15
15
  should_sort = {{ envs.sort | repr }} # pyright: ignore
@@ -38,7 +38,7 @@ if fraction > 1:
38
38
  "-c",
39
39
  bamfile
40
40
  ]
41
- nreads = run_command(cmd, stdout="return").strip()
41
+ nreads = run_command(cmd, stdout="return").strip() # type: ignore
42
42
  fraction = fraction / float(int(nreads))
43
43
 
44
44
  ofile = (
biopipen/scripts/bam/BamSort.py ADDED
@@ -0,0 +1,141 @@
1
+ from hashlib import md5
2
+ from pathlib import Path
3
+ from biopipen.utils.misc import run_command, dict_to_cli_args
4
+
5
+ infile: str = {{ in.bamfile | quote }} # pyright: ignore # noqa
6
+ outfile = Path({{ out.outfile | quote }}) # pyright: ignore
7
+ args: dict = {{ envs | dict | repr }} # pyright: ignore
8
+ ncores = args.pop("ncores")
9
+ tool = args.pop("tool")
10
+ samtools = args.pop("samtools")
11
+ sambamba = args.pop("sambamba")
12
+ tmpdir = args.pop("tmpdir")
13
+ byname = args.pop("byname")
14
+ should_index = args.pop("index")
15
+ sig = md5(infile.encode()).hexdigest()
16
+ tmpdir = Path(tmpdir).joinpath(
17
+ f"biopipen_BamSort_{{job.index}}_{sig}_{Path(infile).name}"
18
+ )
19
+ tmpdir.mkdir(parents=True, exist_ok=True)
20
+ tmpdir = str(tmpdir)
21
+
22
+
23
+ def use_samtools():
24
+ """Use samtools to sort/index bam file.
25
+
26
+ Usage: samtools sort [options...] [in.bam]
27
+ Options:
28
+ -l INT Set compression level, from 0 (uncompressed) to 9 (best)
29
+ -u Output uncompressed data (equivalent to -l 0)
30
+ -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]
31
+ -M Use minimiser for clustering unaligned/unplaced reads
32
+ -K INT Kmer size to use for minimiser [20]
33
+ -n Sort by read name (not compatible with samtools index command)
34
+ -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)
35
+ -o FILE Write final output to FILE rather than standard output
36
+ -T PREFIX Write temporary files to PREFIX.nnnn.bam
37
+ --no-PG
38
+ Do not add a PG line
39
+ --template-coordinate
40
+ Sort by template-coordinate
41
+ --input-fmt-option OPT[=VAL]
42
+ Specify a single input file format option in the form
43
+ of OPTION or OPTION=VALUE
44
+ -O, --output-fmt FORMAT[,OPT[=VAL]]...
45
+ Specify output format (SAM, BAM, CRAM)
46
+ --output-fmt-option OPT[=VAL]
47
+ Specify a single output file format option in the form
48
+ of OPTION or OPTION=VALUE
49
+ --reference FILE
50
+ Reference sequence FASTA FILE [null]
51
+ -@, --threads INT
52
+ Number of additional threads to use [0]
53
+ --write-index
54
+ Automatically index the output files [off]
55
+ --verbosity INT
56
+ Set level of verbosity
57
+ """ # noqa
58
+ sargs = args.copy()
59
+ sargs["n"] = byname
60
+ sargs["T"] = f"{tmpdir}/tmp"
61
+ sargs["threads"] = ncores
62
+
63
+ if should_index:
64
+ sargs["write-index"] = True
65
+ # https://github.com/samtools/samtools/issues/1196
66
+ sargs["o"] = f"{outfile}##idx##{outfile}.bai"
67
+ else:
68
+ sargs["o"] = outfile
69
+
70
+ n_outfmt = sum(["O" in sargs, "output-fmt" in sargs])
71
+ if n_outfmt > 1:
72
+ raise ValueError(
73
+ "envs.args cannot contain both 'O' and 'output-fmt'"
74
+ )
75
+ if n_outfmt == 0:
76
+ sargs["O"] = "BAM"
77
+
78
+ cmd = [
79
+ samtools,
80
+ "sort",
81
+ *dict_to_cli_args(sargs),
82
+ infile,
83
+ ]
84
+ run_command(cmd)
85
+
86
+
87
+ def use_sambamba():
88
+ """Use sambamba to sort/index bam file.
89
+
90
+ sambamba 0.8.2
91
+ by Artem Tarasov and Pjotr Prins (C) 2012-2021
92
+ LDC 1.28.1 / DMD v2.098.1 / LLVM12.0.0 / bootstrap LDC - the LLVM D compiler (1.28.1)
93
+
94
+ Usage: sambamba-sort [options] <input.bam>
95
+
96
+ Options: -m, --memory-limit=LIMIT
97
+ approximate total memory limit for all threads (by default 2GB)
98
+ --tmpdir=TMPDIR
99
+ directory for storing intermediate files; default is system directory for temporary files
100
+ -o, --out=OUTPUTFILE
101
+ output file name; if not provided, the result is written to a file with .sorted.bam extension
102
+ -n, --sort-by-name
103
+ sort by read name instead of coordinate (lexicographical order)
104
+ --sort-picard
105
+ sort by query name like in picard
106
+ -N, --natural-sort
107
+ sort by read name instead of coordinate (so-called 'natural' sort as in samtools)
108
+ -M, --match-mates
109
+ pull mates of the same alignment together when sorting by read name
110
+ -l, --compression-level=COMPRESSION_LEVEL
111
+ level of compression for sorted BAM, from 0 to 9
112
+ -u, --uncompressed-chunks
113
+ write sorted chunks as uncompressed BAM (default is writing with compression level 1), that might be faster in some cases but uses more disk space
114
+ -p, --show-progress
115
+ show progressbar in STDERR
116
+ -t, --nthreads=NTHREADS
117
+ use specified number of threads
118
+ -F, --filter=FILTER
119
+ keep only reads that satisfy FILTER
120
+ """ # noqa
121
+ sargs = args.copy()
122
+ sargs["nthreads"] = ncores
123
+ sargs["n"] = byname
124
+ sargs["tmpdir"] = tmpdir
125
+ sargs["o"] = outfile
126
+ cmd = [
127
+ sambamba,
128
+ "sort",
129
+ *dict_to_cli_args(sargs, sep="="),
130
+ infile,
131
+ ]
132
+ run_command(cmd)
133
+
134
+
135
+ if __name__ == "__main__":
136
+ if tool == "samtools":
137
+ use_samtools()
138
+ elif tool == "sambamba":
139
+ use_sambamba()
140
+ else:
141
+ raise ValueError(f"Unknown tool: {tool}")
biopipen/scripts/bam/BamSplitChroms.py CHANGED
@@ -2,12 +2,12 @@ from pathlib import Path
2
2
  from biopipen.utils.misc import run_command
3
3
  from biopipen.utils.reference import bam_index
4
4
 
5
- bamfile = {{in.bamfile | quote}} # pyright: ignore
6
- outdir = {{out.outdir | quote}} # pyright: ignore
7
- tool = {{envs.tool | quote}} # pyright: ignore
8
- samtools = {{envs.samtools | quote}} # pyright: ignore
9
- sambamba = {{envs.sambamba | quote}} # pyright: ignore
10
- ncores = {{envs.ncores | repr}} # pyright: ignore
5
+ bamfile: str = {{in.bamfile | quote}} # pyright: ignore # noqa
6
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
7
+ tool: str = {{envs.tool | quote}} # pyright: ignore
8
+ samtools: str = {{envs.samtools | quote}} # pyright: ignore
9
+ sambamba: str = {{envs.sambamba | quote}} # pyright: ignore
10
+ ncores: int = {{envs.ncores | repr}} # pyright: ignore
11
11
  keep_other_sq = {{envs.keep_other_sq | repr}} # pyright: ignore
12
12
  chroms_to_keep = {{envs.chroms | repr}} # pyright: ignore
13
13
  should_index = {{envs.index | bool}} # pyright: ignore
@@ -17,13 +17,13 @@ def _remove_other_sq(infile, chrom, outfile):
17
17
  exe = samtools if tool == "samtools" else sambamba
18
18
  print("\nRemoving other chromosomes in @SQ in header")
19
19
  header_cmd = [exe, "view", "-H", infile]
20
- header_p = run_command(
20
+ header_p = run_command( # type: ignore
21
21
  header_cmd,
22
22
  stdout=True,
23
23
  wait=False,
24
24
  print_command=True,
25
25
  )
26
- header = header_p.stdout.read().decode().strip().splitlines()
26
+ header = header_p.stdout.read().decode().strip().splitlines() # type: ignore
27
27
  new_header = []
28
28
  for line in header:
29
29
  if line.startswith("@SQ"):
@@ -63,7 +63,7 @@ def use_samtools():
63
63
  "| grep '^@SQ' | cut -f 2 | cut -d ':' -f 2"
64
64
  )
65
65
  p = run_command(cmd, stdout=True, wait=False)
66
- chroms = p.stdout.read().decode().strip().splitlines()
66
+ chroms = p.stdout.read().decode().strip().splitlines() # type: ignore
67
67
  else:
68
68
  print("\nUsing provided chromosomes")
69
69
  chroms = chroms_to_keep
@@ -121,7 +121,7 @@ def use_sambamba():
121
121
  "| grep '^@SQ' | cut -f 2 | cut -d ':' -f 2"
122
122
  )
123
123
  p = run_command(cmd, stdout=True, wait=False)
124
- chroms = p.stdout.read().decode().splitlines()
124
+ chroms = p.stdout.read().decode().splitlines() # type: ignore
125
125
  else:
126
126
  print("\nUsing provided chromosomes")
127
127
  chroms = chroms_to_keep
biopipen/scripts/bam/BamSubsetByBed.py CHANGED
@@ -4,9 +4,9 @@ from biopipen.utils.misc import run_command, logger
4
4
  # using:
5
5
  # samtools view --subsample 0.1 --subsample-seed 1234 --threads 4 -b -o out.bam in.bam
6
6
 
7
- bamfile = {{ in.bamfile | repr }} # pyright: ignore # noqa
8
- bedfile = {{ in.bedfile | repr }} # pyright: ignore # noqa
9
- outfile = Path({{ out.outfile | repr }}) # pyright: ignore
7
+ bamfile = {{ in.bamfile | quote }} # pyright: ignore # noqa
8
+ bedfile = {{ in.bedfile | quote }} # pyright: ignore # noqa
9
+ outfile = Path({{ out.outfile | quote }}) # pyright: ignore
10
10
  ncores = {{ envs.ncores | int }} # pyright: ignore
11
11
  samtools = {{ envs.samtools | repr }} # pyright: ignore
12
12
  tool = {{ envs.tool | repr }} # pyright: ignore
biopipen/scripts/bam/CNVpytor.py CHANGED
@@ -6,17 +6,17 @@ from datetime import datetime
6
6
  from biopipen.utils.reference import bam_index
7
7
  from biopipen.utils.misc import run_command, dict_to_cli_args, logger
8
8
 
9
- bamfile = {{in.bamfile | quote}} # pyright: ignore # noqa
10
- snpfile = {{in.snpfile | repr}} # pyright: ignore
9
+ bamfile: str = {{in.bamfile | quote}} # pyright: ignore # noqa
10
+ snpfile: str = {{in.snpfile | quote}} # pyright: ignore
11
11
  outdir = Path({{out.outdir | quote}}) # pyright: ignore
12
- cnvpytor = {{envs.cnvpytor | quote}} # pyright: ignore
13
- samtools = {{envs.samtools | quote}} # pyright: ignore
14
- ncores = {{envs.ncores | int}} # pyright: ignore
12
+ cnvpytor: str = {{envs.cnvpytor | quote}} # pyright: ignore
13
+ samtools: str = {{envs.samtools | quote}} # pyright: ignore
14
+ ncores: int = {{envs.ncores | int}} # pyright: ignore
15
15
  refdir = {{envs.refdir | quote}} # pyright: ignore
16
16
  genome = {{envs.genome | quote}} # pyright: ignore
17
- chrsize = {{envs.chrsize | quote}} # pyright: ignore
18
- filters = {{envs.filters | repr}} # pyright: ignore
19
- args = {{envs | repr}} # pyright: ignore
17
+ chrsize: str = {{envs.chrsize | quote}} # pyright: ignore
18
+ filters: dict = {{envs.filters | repr}} # pyright: ignore
19
+ args: dict = {{envs | dict}} # pyright: ignore
20
20
 
21
21
  del args['cnvpytor']
22
22
  del args['ncores']
@@ -27,7 +27,7 @@ del args['chrsize']
27
27
  del args['filters']
28
28
 
29
29
 
30
- bamfile = bam_index(bamfile, outdir, samtools, ncores)
30
+ bamfile: Path = bam_index(bamfile, str(outdir), samtools, ncores=ncores)
31
31
 
32
32
  NOSNP_COLS = [
33
33
  "CNVtype",
@@ -293,7 +293,7 @@ def cnvpytor2vcf(infile, snp):
293
293
  fout.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
294
294
  fout.write('##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n')
295
295
  fout.write('##FORMAT=<ID=PE,Number=1,Type=String,Description="Number of paired-ends that support the event">\n')
296
- fout.write(f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{Path(bamfile).stem}\n")
296
+ fout.write(f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{bamfile.stem}\n")
297
297
  prev_chrom, chrom_seq, count = "", "", 0
298
298
  for line in fin:
299
299
  # type, coor, length, rd, p1, p2, p3, p4, q0, pe = line.strip("\n").split()