biopipen 0.32.3__py3-none-any.whl → 0.33.1__py3-none-any.whl

This diff shows the changes between the two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: see the package's registry page for details.

Files changed (118)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +6 -0
  3. biopipen/core/filters.py +35 -23
  4. biopipen/core/testing.py +6 -1
  5. biopipen/ns/bam.py +39 -0
  6. biopipen/ns/cellranger.py +5 -0
  7. biopipen/ns/cellranger_pipeline.py +2 -2
  8. biopipen/ns/cnvkit_pipeline.py +4 -1
  9. biopipen/ns/delim.py +33 -27
  10. biopipen/ns/protein.py +99 -0
  11. biopipen/ns/scrna.py +428 -250
  12. biopipen/ns/snp.py +16 -3
  13. biopipen/ns/tcr.py +125 -1
  14. biopipen/ns/vcf.py +34 -0
  15. biopipen/ns/web.py +5 -1
  16. biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
  17. biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
  18. biopipen/reports/tcr/ClonalStats.svelte +15 -0
  19. biopipen/reports/utils/misc.liq +20 -7
  20. biopipen/scripts/bam/BamMerge.py +2 -2
  21. biopipen/scripts/bam/BamSampling.py +4 -4
  22. biopipen/scripts/bam/BamSort.py +141 -0
  23. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  24. biopipen/scripts/bam/BamSubsetByBed.py +3 -3
  25. biopipen/scripts/bam/CNVpytor.py +10 -10
  26. biopipen/scripts/bam/ControlFREEC.py +11 -11
  27. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  28. biopipen/scripts/bed/BedConsensus.py +5 -5
  29. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  30. biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
  31. biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
  32. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  33. biopipen/scripts/cellranger/CellRangerCount.py +20 -9
  34. biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
  35. biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
  36. biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
  37. biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
  38. biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
  39. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
  41. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  42. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  43. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
  44. biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
  45. biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
  46. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  47. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  48. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  49. biopipen/scripts/delim/SampleInfo.R +94 -148
  50. biopipen/scripts/misc/Config2File.py +2 -2
  51. biopipen/scripts/misc/Str2File.py +2 -2
  52. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  53. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  54. biopipen/scripts/protein/Prodigy.py +4 -4
  55. biopipen/scripts/protein/RMSD.py +178 -0
  56. biopipen/scripts/regulatory/MotifScan.py +8 -8
  57. biopipen/scripts/scrna/CellCellCommunication.py +59 -22
  58. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  59. biopipen/scripts/scrna/MarkersFinder.R +273 -654
  60. biopipen/scripts/scrna/RadarPlots.R +73 -53
  61. biopipen/scripts/scrna/SCP-plot.R +15202 -0
  62. biopipen/scripts/scrna/ScVelo.py +0 -0
  63. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
  64. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
  65. biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
  66. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
  67. biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
  68. biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
  69. biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
  70. biopipen/scripts/scrna/SeuratPreparing.R +138 -81
  71. biopipen/scripts/scrna/SlingShot.R +71 -0
  72. biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
  73. biopipen/scripts/snp/Plink2GTMat.py +26 -11
  74. biopipen/scripts/snp/PlinkFilter.py +7 -7
  75. biopipen/scripts/snp/PlinkFromVcf.py +8 -5
  76. biopipen/scripts/snp/PlinkSimulation.py +4 -4
  77. biopipen/scripts/snp/PlinkUpdateName.py +4 -4
  78. biopipen/scripts/stats/ChowTest.R +48 -22
  79. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  80. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  81. biopipen/scripts/tcr/ClonalStats.R +484 -0
  82. biopipen/scripts/tcr/ScRepLoading.R +127 -0
  83. biopipen/scripts/tcr/TCRDock.py +10 -6
  84. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  85. biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
  86. biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
  87. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  88. biopipen/scripts/vcf/BcftoolsSort.py +4 -4
  89. biopipen/scripts/vcf/BcftoolsView.py +5 -5
  90. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  91. biopipen/scripts/vcf/VcfAnno.py +11 -11
  92. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  93. biopipen/scripts/vcf/VcfFilter.py +5 -5
  94. biopipen/scripts/vcf/VcfFix.py +7 -7
  95. biopipen/scripts/vcf/VcfFix_utils.py +12 -3
  96. biopipen/scripts/vcf/VcfIndex.py +3 -3
  97. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  98. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  99. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  100. biopipen/scripts/vcf/bcftools_utils.py +3 -3
  101. biopipen/scripts/web/Download.py +8 -4
  102. biopipen/scripts/web/DownloadList.py +5 -5
  103. biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
  104. biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
  105. biopipen/scripts/web/gcloud_common.py +1 -1
  106. biopipen/utils/gsea.R +75 -35
  107. biopipen/utils/misc.R +205 -7
  108. biopipen/utils/misc.py +17 -8
  109. biopipen/utils/reference.py +11 -11
  110. biopipen/utils/repr.R +146 -0
  111. biopipen/utils/vcf.py +1 -1
  112. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/METADATA +8 -8
  113. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/RECORD +115 -105
  114. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/WHEEL +1 -1
  115. biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
  116. biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
  117. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
  118. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/entry_points.txt +0 -0
biopipen/ns/snp.py CHANGED
@@ -183,7 +183,7 @@ class PlinkFromVcf(Proc):
183
183
  vcf_idspace_to: convert all spaces in sample IDs to this character.
184
184
  set_missing_var_ids: update variant IDs using a template string,
185
185
  with a '@' where the chromosome code should go, and a '#' where the
186
- base-pair position belongs. You can also specify `\$r` and `\$a` for
186
+ base-pair position belongs. You can also specify `\\$r` and `\\$a` for
187
187
  the reference and alternate alleles, respectively.
188
188
  See <https://www.cog-genomics.org/plink/2.0/data#set_all_var_ids>
189
189
  max_alleles (type=int): Maximum number of alleles per variant.
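
As a rough illustration of the `--set-missing-var-ids` template described above (a hypothetical sketch, not code from the package; plink2 itself performs this substitution):

# Hypothetical sketch of the placeholder substitution plink2 performs:
# '@' -> chromosome code, '#' -> base-pair position, '$r'/'$a' -> REF/ALT alleles.
def fill_var_id(template: str, chrom: str, pos: int, ref: str, alt: str) -> str:
    return (
        template.replace("@", chrom)
        .replace("#", str(pos))
        .replace("$r", ref)
        .replace("$a", alt)
    )

print(fill_var_id("@_#_$r_$a", "1", 12345, "A", "G"))  # 1_12345_A_G
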
@@ -191,7 +191,7 @@ class PlinkFromVcf(Proc):
191
191
  Note that `_` will be replaced by `-` in the argument names.
192
192
  """ # noqa: E501
193
193
  input = "invcf:file"
194
- output = "outdir:dir:{{in.invcf | regex_replace: '\\.gz$', '' | stem}}"
194
+ output = "outdir:dir:{{in.invcf.stem | regex_replace: '\\.gz$', ''}}"
195
195
  lang = config.lang.python
196
196
  envs = {
197
197
  "plink": config.exe.plink2,
@@ -217,7 +217,14 @@ class Plink2GTMat(Proc):
217
217
 
218
218
  The allelic dosage is used as the values of genotype matrix.
219
219
  "--keep-allele-order" is used to keep the allele order consistent with the
220
- reference allele first.
220
+ reference allele first. This way, the genotype of homozygous reference alleles
221
+ will be encoded as 2, heterozygous as 1, and homozygous alternate alleles as 0.
222
+ This is the PLINK dosage encoding. If you want to use this encoding, you can
223
+ set `envs.gtcoding` to `plink`. Otherwise, the default encoding is `vcf`, which
224
+ will encode the genotype as 0, 1, and 2 for homozygous reference, heterozygous,
225
+ and homozygous alternate alleles, respectively.
226
+
227
+ Note that `envs.gtcoding = "vcf"` only works for biallelic variants for now.
221
228
 
222
229
  Input:
223
230
  indir: Input directory containing the PLINK files.
@@ -241,6 +248,11 @@ class Plink2GTMat(Proc):
241
248
  respectively.
242
249
  trans_chr: A dictionary to translate chromosome numbers to chromosome names.
243
250
  missing_id: what to use as the rs if missing.
251
+ gtcoding (choice): The genotype coding to use.
252
+ - vcf: 0/1/2 for homozygous reference, heterozygous, and homozygous
253
+ alternate alleles, respectively.
254
+ - plink: 2/1/0 for homozygous reference, heterozygous, and homozygous
255
+ alternate alleles, respectively.
244
256
  """
245
257
  input = "indir:dir"
246
258
  output = "outfile:file:{{in.indir | stem}}-gtmat.txt"
@@ -253,6 +265,7 @@ class Plink2GTMat(Proc):
253
265
  "varid": "{chr}_{pos}_{varid}_{ref}_{alt}",
254
266
  "trans_chr": {"23": "X", "24": "Y", "25": "XY", "26": "M"},
255
267
  "missing_id": "NA",
268
+ "gtcoding": "vcf",
256
269
  }
257
270
  script = "file://../scripts/snp/Plink2GTMat.py"
258
271
 
biopipen/ns/tcr.py CHANGED
@@ -39,7 +39,8 @@ class ImmunarchLoading(Proc):
39
39
  information.
40
40
 
41
41
  Output:
42
- rdsfile: The RDS file with the data and metadata
42
+ rdsfile: The RDS file with the data and metadata, which can be processed by
43
+ other `immunarch` functions.
43
44
  metatxt: The meta data at cell level, which can be used to attach to the Seurat object
44
45
 
45
46
  Envs:
@@ -1675,3 +1676,126 @@ class TCRDock(Proc):
1675
1676
  "data_dir": None,
1676
1677
  }
1677
1678
  script = "file://../scripts/tcr/TCRDock.py"
1679
+
1680
+
1681
+ class ScRepLoading(Proc):
1682
+ """Load the single cell TCR/BCR data into a `scRepertoire` compatible object
1683
+
1684
+ This process loads the single cell TCR/BCR data into a `scRepertoire`
1685
+ compatible object. Later, `scRepertoire::combineExpression` can be used to
1686
+ combine the expression data with the TCR/BCR data.
1687
+
1688
+ For the data path specified at `TCRData` in the input file, we will first find
1689
+ `filtered_contig_annotations.csv` and `filtered_contig_annotations.csv.gz` in the
1690
+ path. If neither of them exists, we will find `all_contig_annotations.csv` and
1691
+ `all_contig_annotations.csv.gz` in the path and a warning will be raised
1692
+ (You can find it at `./.pipen/<pipeline-name>/ImmunarchLoading/<job.index>/job.stderr`).
1693
+
1694
+ If none of the files exists, an error will be raised.
1695
+
1696
+ Input:
1697
+ metafile: The meta data of the samples
1698
+ A tab-delimited file
1699
+ Two columns are required:
1700
+ * `Sample` to specify the sample names.
1701
+ * `TCRData` to assign the path of the data to the samples,
1702
+ and this column will be excluded as metadata.
1703
+ Immunarch is able to fetch the sample names from the names of
1704
+ the target files. However, 10x data yields results like
1705
+ `filtered_contig_annotations.csv`, which doesn't have any name
1706
+ information.
1707
+
1708
+ Output:
1709
+ outfile: The `scRepertoire` compatible object in RDS format
1710
+
1711
+ Envs:
1712
+ combineTCR (type=json): The extra arguments for `scRepertoire::combineTCR` function.
1713
+ See also <https://www.borch.dev/uploads/screpertoire/reference/combinetcr>
1714
+ exclude (auto): The columns to exclude from the metadata to add to the object.
1715
+ A list of column names to exclude or a string with column names separated by `,`.
1716
+ By default, `TCRData` and `RNAData` will be excluded.
1717
+
1718
+ """ # noqa: E501
1719
+ input = "metafile:file"
1720
+ output = "outfile:file:{{in.metafile | stem}}.scRep.RDS"
1721
+ lang = config.lang.rscript
1722
+ envs = {"combineTCR": {"samples": True}, "exclude": ["TCRData", "RNAData"]}
1723
+ script = "file://../scripts/tcr/ScRepLoading.R"
1724
+
1725
+
1726
+ class ClonalStats(Proc):
1727
+ """Visualize the clonal information.
1728
+
1729
+ Using [`scplotter`](https://github.com/pwwang/scplotter) to visualize the clonal
1730
+ information.
1731
+
1732
+ Input:
1733
+ screpfile: The `scRepertoire` object in RDS format
1734
+
1735
+ Output:
1736
+ outdir: The output directory containing the plots
1737
+
1738
+ Envs:
1739
+ mutaters (type=json;order=-9): The mutaters passed to `dplyr::mutate()` to add new variables.
1740
+ When the object loaded from `in.screpfile` is a list, the mutaters will be applied to each element.
1741
+ The keys are the names of the new variables, and the values are the expressions.
1742
+ When it is a `Seurat` object, typically an output of `scRepertoire::combineExpression()`,
1743
+ the mutaters will be applied to the `meta.data`.
1744
+ viz_type (choice): The type of visualization to generate.
1745
+ - volume: The volume of the clones using [`ClonalVolumePlot`](https://pwwang.github.io/scplotter/reference/ClonalVolumePlot.html)
1746
+ - abundance: The abundance of the clones using [`ClonalAbundancePlot`](https://pwwang.github.io/scplotter/reference/ClonalAbundancePlot.html)
1747
+ - length: The length of the CDR3 sequences using [`ClonalLengthPlot`](https://pwwang.github.io/scplotter/reference/ClonalLengthPlot.html)
1748
+ - residency: The residency of the clones using [`ClonalResidencyPlot`](https://pwwang.github.io/scplotter/reference/ClonalResidencyPlot.html)
1749
+ - dynamics: The dynamics of the clones using [`ClonalDynamicsPlot`](https://pwwang.github.io/scplotter/reference/ClonalDynamicsPlot.html)
1750
+ - composition: The composition of the clones using [`ClonalCompositionPlot`](https://pwwang.github.io/scplotter/reference/ClonalCompositionPlot.html)
1751
+ - overlap: The overlap of the clones using [`ClonalOverlapPlot`](https://pwwang.github.io/scplotter/reference/ClonalOverlapPlot.html)
1752
+ - diversity: The diversity of the clones using [`ClonalDiversityPlot`](https://pwwang.github.io/scplotter/reference/ClonalDiversityPlot.html)
1753
+ - geneusage: The gene usage of the clones using [`ClonalGeneUsagePlot`](https://pwwang.github.io/scplotter/reference/ClonalGeneUsagePlot.html)
1754
+ - positional: The positional information of the clones using [`ClonalPositionalPlot`](https://pwwang.github.io/scplotter/reference/ClonalPositionalPlot.html)
1755
+ - kmer: The kmer information of the clones using [`ClonalKmerPlot`](https://pwwang.github.io/scplotter/reference/ClonalKmerPlot.html)
1756
+ - rarefaction: The rarefaction curve of the clones using [`ClonalRarefactionPlot`](https://pwwang.github.io/scplotter/reference/ClonalRarefactionPlot.html)
1757
+ subset: An expression to subset the data before plotting.
1758
+ Similar to `mutaters`, it will be applied to each element by `dplyr::filter()` if the object
1759
+ loaded from `in.screpfile` is a list; otherwise, it will be applied to
1760
+ `subset(sobj, subset = <expr>)` if the object is a `Seurat` object.
1761
+ devpars (ns): The parameters for the plotting device.
1762
+ - width (type=int): The width of the device
1763
+ - height (type=int): The height of the device
1764
+ - res (type=int): The resolution of the device
1765
+ more_formats (list): The extra formats to save the plots in, other than PNG.
1766
+ save_code (flag): Whether to save the code used to generate the plots
1767
+ Note that the data directly used to generate the plots will also be saved in an `rda` file.
1768
+ Be careful if the data is large as it may take a lot of disk space.
1769
+ descr: The description of the plot, used to show in the report.
1770
+ <more>: The arguments for the plot function
1771
+ See the documentation of the corresponding plot function for the details
1772
+ cases (type=json): The cases to generate the plots if we have multiple cases.
1773
+ The keys are the names of the cases, and the values are the arguments for the plot function.
1774
+ The arguments in `envs` will be used if not specified in `cases`, except for `mutaters`.
1775
+ Sections can be specified as the prefix of the case name, separated by `::`.
1776
+ For example, if you have a case named `Clonal Volume::Case1`, the plot will be put in the
1777
+ section `Clonal Volume`. By default, when there are multiple cases for the same 'viz_type', the name of the 'viz_type' will be used
1778
+ as the default section name (for example, when 'viz_type' is 'volume', the section name will be 'Clonal Volume').
1779
+ When there is only a single case, the section name will default to 'DEFAULT', which will not be shown
1780
+ in the report.
1781
+ """ # noqa: E501
1782
+ input = "screpfile:file"
1783
+ output = "outdir:dir:{{in.screpfile | stem}}.clonalstats"
1784
+ lang = config.lang.rscript
1785
+ envs = {
1786
+ "mutaters": {},
1787
+ "subset": None,
1788
+ "viz_type": None,
1789
+ "devpars": {"width": None, "height": None, "res": 100},
1790
+ "more_formats": [],
1791
+ "save_code": False,
1792
+ "descr": None,
1793
+ "cases": {
1794
+ "Clonal Volume": {"viz_type": "volume"},
1795
+ "Clonal Abundance": {"viz_type": "abundance"},
1796
+ "CDR3 Length": {"viz_type": "length"},
1797
+ "Clonal Diversity": {"viz_type": "diversity"},
1798
+ }
1799
+ }
1800
+ script = "file://../scripts/tcr/ClonalStats.R"
1801
+ plugin_opts = {"report": "file://../reports/tcr/ClonalStats.svelte"}
biopipen/ns/vcf.py CHANGED
@@ -595,6 +595,40 @@ class BcftoolsSort(Proc):
595
595
  script = "file://../scripts/vcf/BcftoolsSort.py"
596
596
 
597
597
 
598
+ class BcftoolsMerge(Proc):
599
+ """Merge multiple VCF files using `bcftools merge`.
600
+
601
+ Input:
602
+ infiles: The input VCF files
603
+
604
+ Output:
605
+ outfile: The merged VCF file.
606
+
607
+ Envs:
608
+ bcftools: Path to bcftools
609
+ tabix: Path to tabix, used to index infile/outfile
610
+ ncores (type=int): Number of cores (`--threads`) to use
611
+ gz (flag): Whether to gzip the output file
612
+ index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
613
+ <more>: Other arguments for `bcftools merge`.
614
+ See also <https://samtools.github.io/bcftools/bcftools.html#merge>
615
+ """
616
+ input = "infiles:files"
617
+ output = (
618
+ "outfile:file:{{in.infiles | first | stem | append: '_etc_merged'}}.vcf"
619
+ "{{'.gz' if envs.index or envs.gz else ''}}"
620
+ )
621
+ lang = config.lang.python
622
+ envs = {
623
+ "bcftools": config.exe.bcftools,
624
+ "tabix": config.exe.tabix,
625
+ "ncores": config.misc.ncores,
626
+ "gz": True,
627
+ "index": True,
628
+ }
629
+ script = "file://../scripts/vcf/BcftoolsMerge.py"
630
+
631
+
598
632
  class BcftoolsView(Proc):
599
633
  """View, subset and filter VCF files by position and filtering expression.
600
634
 
biopipen/ns/web.py CHANGED
@@ -32,7 +32,11 @@ class Download(Proc):
32
32
  input = "url"
33
33
  output = (
34
34
  "outfile:file:"
35
- "{{in.url | basename | replace: '%2E', '.' | slugify: separator='.'}}"
35
+ """{{in.url
36
+ | basename
37
+ | url_decode
38
+ | slugify: separator='.', lowercase=False, regex_pattern='[^-a-zA-Z0-9_]+'
39
+ }}"""
36
40
  )
37
41
  lang = config.lang.python
38
42
  envs = {
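
Assuming the `url_decode` and `slugify` filters wrap `urllib.parse.unquote` and python-slugify (an assumption, not verified against the filter definitions), the new output-name template above behaves roughly like this:

# Hedged illustration of the output-name template; the example URL basename is made up.
from urllib.parse import unquote
from slugify import slugify

basename = "GRCh38%2Eprimary_assembly.fa.gz"   # basename of in.url
name = slugify(
    unquote(basename),                         # decode %2E -> "."
    separator=".",
    lowercase=False,
    regex_pattern="[^-a-zA-Z0-9_]+",
)
print(name)  # GRCh38.primary_assembly.fa.gz (case, dots and underscores preserved)
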
biopipen/reports/scrna/SeuratClusterStats.svelte CHANGED
@@ -1,7 +1,7 @@
1
1
  {% from "utils/misc.liq" import report_jobs, table_of_images -%}
2
2
  {% from_ os import path %}
3
3
  <script>
4
- import { DataTable, Image, Descr } from "$libs";
4
+ import { DataTable, Image, Descr, Plotly } from "$libs";
5
5
  import { Tabs, Tab, TabContent } from "$ccs";
6
6
  </script>
7
7
 
biopipen/reports/scrna/SeuratMap2Ref.svelte CHANGED
@@ -6,8 +6,21 @@
6
6
  {%- macro report_job(job, h=1) -%}
7
7
 
8
8
  <h{{h}}>UMAPs</h{{h}}>
9
- {% set imgs = job.outdir | glob: "UMAPs-*.png" %}
10
- {{ table_of_images(imgs) }}
9
+ {% set imgs = [] %}
10
+ {% set caps = [] %}
11
+ {% for png in job.outdir | glob: "UMAPs-*.png" %}
12
+ {% set pdf = png | regex_replace: "\\.png$", ".pdf" %}
13
+ {% set stm = png | stem %}
14
+ {% set _ = imgs.append({"src": png, "download": pdf}) %}
15
+ {% set _ = caps.append(stm | replace: "UMAPs-", "") %}
16
+ {% endfor %}
17
+ {{ table_of_images(imgs, caps) }}
18
+
19
+ <h{{h}}>Mapping Score</h{{h}}>
20
+ <Image
21
+ src="{{job.outdir | joinpath: 'mapping_score.png'}}"
22
+ download="{{job.outdir | joinpath: 'mapping_score.pdf'}}"
23
+ />
11
24
 
12
25
  <h{{h}}>Stats</h{{h}}>
13
26
  {% for stfile in job.outdir | glob: "stats-*.txt" %}
biopipen/reports/tcr/ClonalStats.svelte ADDED
@@ -0,0 +1,15 @@
1
+ {% from "utils/misc.liq" import report_jobs, table_of_images -%}
2
+
3
+ <script>
4
+ import { Image, DataTable, Descr } from "$libs";
5
+ </script>
6
+
7
+ {%- macro report_job(job, h=1) -%}
8
+ {{ job | render_job: h=h }}
9
+ {%- endmacro -%}
10
+
11
+ {%- macro head_job(job) -%}
12
+ <h1>{{job.in.screpfile | stem | escape }}</h1>
13
+ {%- endmacro -%}
14
+
15
+ {{ report_jobs(jobs, head_job, report_job) }}
biopipen/reports/utils/misc.liq CHANGED
@@ -25,7 +25,14 @@ import { Image } from "$libs";
25
25
  {% for batch_srcs in srcs | batch: col, "" %}
26
26
  {% set outer_loop = loop %}
27
27
  <tr>
28
- {% for src in batch_srcs %}
28
+ {% for srcinfo in batch_srcs %}
29
+ {% if srcinfo | isinstance: str %}
30
+ {% set src = srcinfo %}
31
+ {% set download = None %}
32
+ {% else %}
33
+ {% set src = srcinfo['src'] %}
34
+ {% set download = srcinfo.get('download', None) %}
35
+ {% endif %}
29
36
  {% set i = col * outer_loop.index0 + loop.index0 %}
30
37
  {% if i >= len(srcs) %}
31
38
  <td style="width: {{table_width / col}}%"></td>
@@ -33,21 +40,27 @@ import { Image } from "$libs";
33
40
  <td style="width: {{table_width / col}}%; vertical-align:top;">
34
41
  {% if caps is none %}
35
42
  <div
36
- style="padding-left: 28px; font-weight: bold; padding-top: 10px; margin-bottom: -10px;">
43
+ style="padding-left: 28px; font-weight: bold; padding-top: 16px;">
37
44
  {{ src | stem }}
38
45
  </div>
39
46
  {% elif caps is false %}
40
47
  {% else %}
41
48
  <div
42
- style="padding-left: 28px; font-weight: bold; padding-top: 10px; margin-bottom: -10px;">
49
+ style="padding-left: 28px; font-weight: bold; padding-top: 16px;">
43
50
  {{ caps[i] }}
44
51
  </div>
45
52
  {% endif %}
46
- {% if src | replace: ".png", ".pdf" | exists %}
47
- <Image style="max-width: 90%" src={{src | quote}}
48
- download={{src | replace: ".png", ".pdf" | quote}} />
53
+ {% if download %}
54
+ <Image
55
+ style="max-width: 90%"
56
+ src={{src | quote}}
57
+ download={ {{download | json}} }
58
+ />
49
59
  {% else %}
50
- <Image style="max-width: 90%" src={{src | quote}} />
60
+ <Image
61
+ style="max-width: 90%"
62
+ src={{src | quote}}
63
+ />
51
64
  {% endif %}
52
65
  </td>
53
66
  {% endif %}
biopipen/scripts/bam/BamMerge.py CHANGED
@@ -1,8 +1,8 @@
1
1
  from pathlib import Path
2
2
  from biopipen.utils.misc import run_command, logger
3
3
 
4
- bamfiles = {{in.bamfiles | repr}} # pyright: ignore # noqa
5
- outfile = Path({{out.outfile | repr}}) # pyright: ignore
4
+ bamfiles = {{in.bamfiles | default: [] | each: str}} # pyright: ignore # noqa
5
+ outfile = Path({{out.outfile | quote}}) # pyright: ignore
6
6
  ncores = {{envs.ncores | int}} # pyright: ignore
7
7
  tool = {{envs.tool | quote}} # pyright: ignore
8
8
  samtools = {{envs.samtools | quote}} # pyright: ignore
biopipen/scripts/bam/BamSampling.py CHANGED
@@ -4,12 +4,12 @@ from biopipen.utils.misc import run_command, logger
4
4
  # using:
5
5
  # samtools view --subsample 0.1 --subsample-seed 1234 --threads 4 -b -o out.bam in.bam
6
6
 
7
- bamfile = {{ in.bamfile | repr }} # pyright: ignore # noqa
8
- outfile = Path({{ out.outfile | repr }}) # pyright: ignore
7
+ bamfile = {{ in.bamfile | quote }} # pyright: ignore # noqa
8
+ outfile = Path({{ out.outfile | quote }}) # pyright: ignore
9
9
  ncores = {{ envs.ncores | int }} # pyright: ignore
10
10
  samtools = {{ envs.samtools | repr }} # pyright: ignore
11
11
  tool = {{ envs.tool | repr }} # pyright: ignore
12
- fraction = {{ envs.fraction | repr }} # pyright: ignore
12
+ fraction: float = {{ envs.fraction | repr }} # pyright: ignore
13
13
  seed = {{ envs.seed | int }} # pyright: ignore
14
14
  should_index = {{ envs.index | repr }} # pyright: ignore
15
15
  should_sort = {{ envs.sort | repr }} # pyright: ignore
@@ -38,7 +38,7 @@ if fraction > 1:
38
38
  "-c",
39
39
  bamfile
40
40
  ]
41
- nreads = run_command(cmd, stdout="return").strip()
41
+ nreads = run_command(cmd, stdout="return").strip() # type: ignore
42
42
  fraction = fraction / float(int(nreads))
43
43
 
44
44
  ofile = (
biopipen/scripts/bam/BamSort.py ADDED
@@ -0,0 +1,141 @@
1
+ from hashlib import md5
2
+ from pathlib import Path
3
+ from biopipen.utils.misc import run_command, dict_to_cli_args
4
+
5
+ infile: str = {{ in.bamfile | quote }} # pyright: ignore # noqa
6
+ outfile = Path({{ out.outfile | quote }}) # pyright: ignore
7
+ args: dict = {{ envs | dict | repr }} # pyright: ignore
8
+ ncores = args.pop("ncores")
9
+ tool = args.pop("tool")
10
+ samtools = args.pop("samtools")
11
+ sambamba = args.pop("sambamba")
12
+ tmpdir = args.pop("tmpdir")
13
+ byname = args.pop("byname")
14
+ should_index = args.pop("index")
15
+ sig = md5(infile.encode()).hexdigest()
16
+ tmpdir = Path(tmpdir).joinpath(
17
+ f"biopipen_BamSort_{{job.index}}_{sig}_{Path(infile).name}"
18
+ )
19
+ tmpdir.mkdir(parents=True, exist_ok=True)
20
+ tmpdir = str(tmpdir)
21
+
22
+
23
+ def use_samtools():
24
+ """Use samtools to sort/index bam file.
25
+
26
+ Usage: samtools sort [options...] [in.bam]
27
+ Options:
28
+ -l INT Set compression level, from 0 (uncompressed) to 9 (best)
29
+ -u Output uncompressed data (equivalent to -l 0)
30
+ -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]
31
+ -M Use minimiser for clustering unaligned/unplaced reads
32
+ -K INT Kmer size to use for minimiser [20]
33
+ -n Sort by read name (not compatible with samtools index command)
34
+ -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)
35
+ -o FILE Write final output to FILE rather than standard output
36
+ -T PREFIX Write temporary files to PREFIX.nnnn.bam
37
+ --no-PG
38
+ Do not add a PG line
39
+ --template-coordinate
40
+ Sort by template-coordinate
41
+ --input-fmt-option OPT[=VAL]
42
+ Specify a single input file format option in the form
43
+ of OPTION or OPTION=VALUE
44
+ -O, --output-fmt FORMAT[,OPT[=VAL]]...
45
+ Specify output format (SAM, BAM, CRAM)
46
+ --output-fmt-option OPT[=VAL]
47
+ Specify a single output file format option in the form
48
+ of OPTION or OPTION=VALUE
49
+ --reference FILE
50
+ Reference sequence FASTA FILE [null]
51
+ -@, --threads INT
52
+ Number of additional threads to use [0]
53
+ --write-index
54
+ Automatically index the output files [off]
55
+ --verbosity INT
56
+ Set level of verbosity
57
+ """ # noqa
58
+ sargs = args.copy()
59
+ sargs["n"] = byname
60
+ sargs["T"] = f"{tmpdir}/tmp"
61
+ sargs["threads"] = ncores
62
+
63
+ if should_index:
64
+ sargs["write-index"] = True
65
+ # https://github.com/samtools/samtools/issues/1196
66
+ sargs["o"] = f"{outfile}##idx##{outfile}.bai"
67
+ else:
68
+ sargs["o"] = outfile
69
+
70
+ n_outfmt = sum(["O" in sargs, "output-fmt" in sargs])
71
+ if n_outfmt > 1:
72
+ raise ValueError(
73
+ "envs.args cannot contain both 'O' and 'output-fmt'"
74
+ )
75
+ if n_outfmt == 0:
76
+ sargs["O"] = "BAM"
77
+
78
+ cmd = [
79
+ samtools,
80
+ "sort",
81
+ *dict_to_cli_args(sargs),
82
+ infile,
83
+ ]
84
+ run_command(cmd)
85
+
86
+
87
+ def use_sambamba():
88
+ """Use sambamba to sort/index bam file.
89
+
90
+ sambamba 0.8.2
91
+ by Artem Tarasov and Pjotr Prins (C) 2012-2021
92
+ LDC 1.28.1 / DMD v2.098.1 / LLVM12.0.0 / bootstrap LDC - the LLVM D compiler (1.28.1)
93
+
94
+ Usage: sambamba-sort [options] <input.bam>
95
+
96
+ Options: -m, --memory-limit=LIMIT
97
+ approximate total memory limit for all threads (by default 2GB)
98
+ --tmpdir=TMPDIR
99
+ directory for storing intermediate files; default is system directory for temporary files
100
+ -o, --out=OUTPUTFILE
101
+ output file name; if not provided, the result is written to a file with .sorted.bam extension
102
+ -n, --sort-by-name
103
+ sort by read name instead of coordinate (lexicographical order)
104
+ --sort-picard
105
+ sort by query name like in picard
106
+ -N, --natural-sort
107
+ sort by read name instead of coordinate (so-called 'natural' sort as in samtools)
108
+ -M, --match-mates
109
+ pull mates of the same alignment together when sorting by read name
110
+ -l, --compression-level=COMPRESSION_LEVEL
111
+ level of compression for sorted BAM, from 0 to 9
112
+ -u, --uncompressed-chunks
113
+ write sorted chunks as uncompressed BAM (default is writing with compression level 1), that might be faster in some cases but uses more disk space
114
+ -p, --show-progress
115
+ show progressbar in STDERR
116
+ -t, --nthreads=NTHREADS
117
+ use specified number of threads
118
+ -F, --filter=FILTER
119
+ keep only reads that satisfy FILTER
120
+ """ # noqa
121
+ sargs = args.copy()
122
+ sargs["nthreads"] = ncores
123
+ sargs["n"] = byname
124
+ sargs["tmpdir"] = tmpdir
125
+ sargs["o"] = outfile
126
+ cmd = [
127
+ sambamba,
128
+ "sort",
129
+ *dict_to_cli_args(sargs, sep="="),
130
+ infile,
131
+ ]
132
+ run_command(cmd)
133
+
134
+
135
+ if __name__ == "__main__":
136
+ if tool == "samtools":
137
+ use_samtools()
138
+ elif tool == "sambamba":
139
+ use_sambamba()
140
+ else:
141
+ raise ValueError(f"Unknown tool: {tool}")
biopipen/scripts/bam/BamSplitChroms.py CHANGED
@@ -2,12 +2,12 @@ from pathlib import Path
2
2
  from biopipen.utils.misc import run_command
3
3
  from biopipen.utils.reference import bam_index
4
4
 
5
- bamfile = {{in.bamfile | quote}} # pyright: ignore
6
- outdir = {{out.outdir | quote}} # pyright: ignore
7
- tool = {{envs.tool | quote}} # pyright: ignore
8
- samtools = {{envs.samtools | quote}} # pyright: ignore
9
- sambamba = {{envs.sambamba | quote}} # pyright: ignore
10
- ncores = {{envs.ncores | repr}} # pyright: ignore
5
+ bamfile: str = {{in.bamfile | quote}} # pyright: ignore # noqa
6
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
7
+ tool: str = {{envs.tool | quote}} # pyright: ignore
8
+ samtools: str = {{envs.samtools | quote}} # pyright: ignore
9
+ sambamba: str = {{envs.sambamba | quote}} # pyright: ignore
10
+ ncores: int = {{envs.ncores | repr}} # pyright: ignore
11
11
  keep_other_sq = {{envs.keep_other_sq | repr}} # pyright: ignore
12
12
  chroms_to_keep = {{envs.chroms | repr}} # pyright: ignore
13
13
  should_index = {{envs.index | bool}} # pyright: ignore
@@ -17,13 +17,13 @@ def _remove_other_sq(infile, chrom, outfile):
17
17
  exe = samtools if tool == "samtools" else sambamba
18
18
  print("\nRemoving other chromosomes in @SQ in header")
19
19
  header_cmd = [exe, "view", "-H", infile]
20
- header_p = run_command(
20
+ header_p = run_command( # type: ignore
21
21
  header_cmd,
22
22
  stdout=True,
23
23
  wait=False,
24
24
  print_command=True,
25
25
  )
26
- header = header_p.stdout.read().decode().strip().splitlines()
26
+ header = header_p.stdout.read().decode().strip().splitlines() # type: ignore
27
27
  new_header = []
28
28
  for line in header:
29
29
  if line.startswith("@SQ"):
@@ -63,7 +63,7 @@ def use_samtools():
63
63
  "| grep '^@SQ' | cut -f 2 | cut -d ':' -f 2"
64
64
  )
65
65
  p = run_command(cmd, stdout=True, wait=False)
66
- chroms = p.stdout.read().decode().strip().splitlines()
66
+ chroms = p.stdout.read().decode().strip().splitlines() # type: ignore
67
67
  else:
68
68
  print("\nUsing provided chromosomes")
69
69
  chroms = chroms_to_keep
@@ -121,7 +121,7 @@ def use_sambamba():
121
121
  "| grep '^@SQ' | cut -f 2 | cut -d ':' -f 2"
122
122
  )
123
123
  p = run_command(cmd, stdout=True, wait=False)
124
- chroms = p.stdout.read().decode().splitlines()
124
+ chroms = p.stdout.read().decode().splitlines() # type: ignore
125
125
  else:
126
126
  print("\nUsing provided chromosomes")
127
127
  chroms = chroms_to_keep
biopipen/scripts/bam/BamSubsetByBed.py CHANGED
@@ -4,9 +4,9 @@ from biopipen.utils.misc import run_command, logger
4
4
  # using:
5
5
  # samtools view --subsample 0.1 --subsample-seed 1234 --threads 4 -b -o out.bam in.bam
6
6
 
7
- bamfile = {{ in.bamfile | repr }} # pyright: ignore # noqa
8
- bedfile = {{ in.bedfile | repr }} # pyright: ignore # noqa
9
- outfile = Path({{ out.outfile | repr }}) # pyright: ignore
7
+ bamfile = {{ in.bamfile | quote }} # pyright: ignore # noqa
8
+ bedfile = {{ in.bedfile | quote }} # pyright: ignore # noqa
9
+ outfile = Path({{ out.outfile | quote }}) # pyright: ignore
10
10
  ncores = {{ envs.ncores | int }} # pyright: ignore
11
11
  samtools = {{ envs.samtools | repr }} # pyright: ignore
12
12
  tool = {{ envs.tool | repr }} # pyright: ignore
biopipen/scripts/bam/CNVpytor.py CHANGED
@@ -6,17 +6,17 @@ from datetime import datetime
6
6
  from biopipen.utils.reference import bam_index
7
7
  from biopipen.utils.misc import run_command, dict_to_cli_args, logger
8
8
 
9
- bamfile = {{in.bamfile | quote}} # pyright: ignore # noqa
10
- snpfile = {{in.snpfile | repr}} # pyright: ignore
9
+ bamfile: str = {{in.bamfile | quote}} # pyright: ignore # noqa
10
+ snpfile: str = {{in.snpfile | quote}} # pyright: ignore
11
11
  outdir = Path({{out.outdir | quote}}) # pyright: ignore
12
- cnvpytor = {{envs.cnvpytor | quote}} # pyright: ignore
13
- samtools = {{envs.samtools | quote}} # pyright: ignore
14
- ncores = {{envs.ncores | int}} # pyright: ignore
12
+ cnvpytor: str = {{envs.cnvpytor | quote}} # pyright: ignore
13
+ samtools: str = {{envs.samtools | quote}} # pyright: ignore
14
+ ncores: int = {{envs.ncores | int}} # pyright: ignore
15
15
  refdir = {{envs.refdir | quote}} # pyright: ignore
16
16
  genome = {{envs.genome | quote}} # pyright: ignore
17
- chrsize = {{envs.chrsize | quote}} # pyright: ignore
18
- filters = {{envs.filters | repr}} # pyright: ignore
19
- args = {{envs | repr}} # pyright: ignore
17
+ chrsize: str = {{envs.chrsize | quote}} # pyright: ignore
18
+ filters: dict = {{envs.filters | repr}} # pyright: ignore
19
+ args: dict = {{envs | dict}} # pyright: ignore
20
20
 
21
21
  del args['cnvpytor']
22
22
  del args['ncores']
@@ -27,7 +27,7 @@ del args['chrsize']
27
27
  del args['filters']
28
28
 
29
29
 
30
- bamfile = bam_index(bamfile, outdir, samtools, ncores)
30
+ bamfile: Path = bam_index(bamfile, str(outdir), samtools, ncores=ncores)
31
31
 
32
32
  NOSNP_COLS = [
33
33
  "CNVtype",
@@ -293,7 +293,7 @@ def cnvpytor2vcf(infile, snp):
293
293
  fout.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
294
294
  fout.write('##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n')
295
295
  fout.write('##FORMAT=<ID=PE,Number=1,Type=String,Description="Number of paired-ends that support the event">\n')
296
- fout.write(f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{Path(bamfile).stem}\n")
296
+ fout.write(f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{bamfile.stem}\n")
297
297
  prev_chrom, chrom_seq, count = "", "", 0
298
298
  for line in fin:
299
299
  # type, coor, length, rd, p1, p2, p3, p4, q0, pe = line.strip("\n").split()