biopipen 0.32.3__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (117) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +6 -0
  3. biopipen/core/filters.py +35 -23
  4. biopipen/core/testing.py +6 -1
  5. biopipen/ns/bam.py +39 -0
  6. biopipen/ns/cellranger.py +5 -0
  7. biopipen/ns/cellranger_pipeline.py +2 -2
  8. biopipen/ns/cnvkit_pipeline.py +4 -1
  9. biopipen/ns/delim.py +33 -27
  10. biopipen/ns/protein.py +99 -0
  11. biopipen/ns/scrna.py +411 -250
  12. biopipen/ns/snp.py +16 -3
  13. biopipen/ns/tcr.py +125 -1
  14. biopipen/ns/vcf.py +34 -0
  15. biopipen/ns/web.py +5 -1
  16. biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
  17. biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
  18. biopipen/reports/tcr/ClonalStats.svelte +15 -0
  19. biopipen/reports/utils/misc.liq +20 -7
  20. biopipen/scripts/bam/BamMerge.py +2 -2
  21. biopipen/scripts/bam/BamSampling.py +4 -4
  22. biopipen/scripts/bam/BamSort.py +141 -0
  23. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  24. biopipen/scripts/bam/BamSubsetByBed.py +3 -3
  25. biopipen/scripts/bam/CNVpytor.py +10 -10
  26. biopipen/scripts/bam/ControlFREEC.py +11 -11
  27. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  28. biopipen/scripts/bed/BedConsensus.py +5 -5
  29. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  30. biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
  31. biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
  32. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  33. biopipen/scripts/cellranger/CellRangerCount.py +20 -9
  34. biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
  35. biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
  36. biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
  37. biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
  38. biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
  39. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
  41. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  42. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  43. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
  44. biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
  45. biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
  46. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  47. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  48. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  49. biopipen/scripts/delim/SampleInfo.R +85 -148
  50. biopipen/scripts/misc/Config2File.py +2 -2
  51. biopipen/scripts/misc/Str2File.py +2 -2
  52. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  53. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  54. biopipen/scripts/protein/Prodigy.py +4 -4
  55. biopipen/scripts/protein/RMSD.py +178 -0
  56. biopipen/scripts/regulatory/MotifScan.py +8 -8
  57. biopipen/scripts/scrna/CellCellCommunication.py +59 -22
  58. biopipen/scripts/scrna/MarkersFinder.R +273 -654
  59. biopipen/scripts/scrna/RadarPlots.R +73 -53
  60. biopipen/scripts/scrna/SCP-plot.R +15202 -0
  61. biopipen/scripts/scrna/ScVelo.py +0 -0
  62. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
  63. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
  64. biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
  65. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
  66. biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
  67. biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
  68. biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
  69. biopipen/scripts/scrna/SeuratPreparing.R +138 -81
  70. biopipen/scripts/scrna/SlingShot.R +71 -0
  71. biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
  72. biopipen/scripts/snp/Plink2GTMat.py +26 -11
  73. biopipen/scripts/snp/PlinkFilter.py +7 -7
  74. biopipen/scripts/snp/PlinkFromVcf.py +8 -5
  75. biopipen/scripts/snp/PlinkSimulation.py +4 -4
  76. biopipen/scripts/snp/PlinkUpdateName.py +4 -4
  77. biopipen/scripts/stats/ChowTest.R +48 -22
  78. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  79. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  80. biopipen/scripts/tcr/ClonalStats.R +484 -0
  81. biopipen/scripts/tcr/ScRepLoading.R +127 -0
  82. biopipen/scripts/tcr/TCRDock.py +10 -6
  83. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  84. biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
  85. biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
  86. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  87. biopipen/scripts/vcf/BcftoolsSort.py +4 -4
  88. biopipen/scripts/vcf/BcftoolsView.py +5 -5
  89. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  90. biopipen/scripts/vcf/VcfAnno.py +11 -11
  91. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  92. biopipen/scripts/vcf/VcfFilter.py +5 -5
  93. biopipen/scripts/vcf/VcfFix.py +7 -7
  94. biopipen/scripts/vcf/VcfFix_utils.py +12 -3
  95. biopipen/scripts/vcf/VcfIndex.py +3 -3
  96. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  97. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  98. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  99. biopipen/scripts/vcf/bcftools_utils.py +3 -3
  100. biopipen/scripts/web/Download.py +8 -4
  101. biopipen/scripts/web/DownloadList.py +5 -5
  102. biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
  103. biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
  104. biopipen/scripts/web/gcloud_common.py +1 -1
  105. biopipen/utils/gsea.R +75 -35
  106. biopipen/utils/misc.R +205 -7
  107. biopipen/utils/misc.py +17 -8
  108. biopipen/utils/reference.py +11 -11
  109. biopipen/utils/repr.R +146 -0
  110. biopipen/utils/vcf.py +1 -1
  111. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/METADATA +8 -8
  112. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/RECORD +114 -105
  113. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
  114. biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
  115. biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
  116. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
  117. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0
biopipen/ns/scrna.py CHANGED
@@ -97,8 +97,8 @@ class SeuratPreparing(Proc):
97
97
 
98
98
  Output:
99
99
  rdsfile: The RDS file with the Seurat object with all samples integrated.
100
- Note that the cell ids are preficed with sample names QC plots will be
101
- saved in `<job.outdir>/before-qc` and `<job.outdir>/after-qc`.
100
+ Note that the cell ids are prefixed with sample names.
101
+ QC plots will be saved in `<job.outdir>/plots`.
102
102
 
103
103
  Envs:
104
104
  ncores (type=int): Number of cores to use.
@@ -140,6 +140,19 @@ class SeuratPreparing(Proc):
140
140
  will keep genes that are expressed in at least 3 cells.
141
141
  ///
142
142
 
143
+ qc_plots (type=json): The plots for QC metrics.
144
+ It should be a json (or python dict) with the keys as the names of the plots and
145
+ the values also as dicts with the following keys:
146
+ * kind: The kind of QC. Either `gene` or `cell` (default).
147
+ * devpars: The device parameters for the plot. A dict with `res`, `height`, and `width`.
148
+ * more_formats: The formats to save the plots other than `png`.
149
+ * save_code: Whether to save the code to reproduce the plot.
150
+ * other arguments passed to
151
+ [`biopipen.utils::VizSeuratCellQC`](https://pwwang.github.io/biopipen.utils.R/reference/VizSeuratCellQC.html)
152
+ when `kind` is `cell` or
153
+ [`biopipen.utils::VizSeuratGeneQC`](https://pwwang.github.io/biopipen.utils.R/reference/VizSeuratGeneQC.html)
154
+ when `kind` is `gene`.
155
+
143
156
  use_sct (flag): Whether use SCTransform routine to integrate samples or not.
144
157
  Before the following procedures, the `RNA` layer will be split by samples.
145
158
 
@@ -244,6 +257,7 @@ class SeuratPreparing(Proc):
244
257
  r-bracer:
245
258
  - check: {{proc.lang}} <(echo "library(bracer)")
246
259
  """ # noqa: E501
260
+
247
261
  input = "metafile:file"
248
262
  output = "rdsfile:file:{{in.metafile | stem}}.seurat.RDS"
249
263
  lang = config.lang.rscript
@@ -252,6 +266,28 @@ class SeuratPreparing(Proc):
252
266
  "cell_qc": None, # "nFeature_RNA > 200 & percent.mt < 5",
253
267
  "cell_qc_per_sample": False,
254
268
  "gene_qc": {"min_cells": 0, "excludes": []},
269
+ "qc_plots": {
270
+ "Violin Plots of QC Metrics": {
271
+ "kind": "cell",
272
+ "plot_type": "violin",
273
+ "devpars": {"res": 100, "height": 600, "width": 1200},
274
+ },
275
+ "Scatter Plots of QC Metrics": {
276
+ "kind": "cell",
277
+ "plot_type": "scatter",
278
+ "devpars": {"res": 100, "height": 800, "width": 1200},
279
+ },
280
+ "Ridge Plots of QC Metrics": {
281
+ "kind": "cell",
282
+ "plot_type": "ridge",
283
+ "devpars": {"res": 100, "height": 800, "width": 1200},
284
+ },
285
+ # "Number of Expressing Cells for Excluded Genes (10)": {
286
+ # "kind": "gene",
287
+ # "features": 10,
288
+ # "devpars": {"res": 100, "height": 1200, "width": 1200}
289
+ # },
290
+ },
255
291
  "use_sct": False,
256
292
  "no_integration": False,
257
293
  "NormalizeData": {},
@@ -338,6 +374,7 @@ class SeuratClustering(Proc):
338
374
  r-dplyr:
339
375
  - check: {{proc.lang}} <(echo "library(dplyr)")
340
376
  """ # noqa: E501
377
+
341
378
  input = "srtobj:file"
342
379
  output = "rdsfile:file:{{in.srtobj | stem}}.RDS"
343
380
  lang = config.lang.rscript
@@ -413,6 +450,7 @@ class SeuratSubClustering(Proc):
413
450
  Keys are the names of the cases and values are the dicts inherited from `envs` except `mutaters` and `cache`.
414
451
  If empty, a case with name `subcluster` will be created with default parameters.
415
452
  """ # noqa: E501
453
+
416
454
  input = "srtobj:file"
417
455
  output = "rdsfile:file:{{in.srtobj | stem}}.RDS"
418
456
  lang = config.lang.rscript
@@ -487,7 +525,10 @@ class SeuratClusterStats(Proc):
487
525
  srtobj: The seurat object loaded by `SeuratClustering`
488
526
 
489
527
  Output:
490
- outdir: The output directory
528
+ outdir: The output directory.
529
+ Different types of plots will be saved in different subdirectories.
530
+ For example, `clustree` plots will be saved in `clustrees` subdirectory.
531
+ For each case in `envs.clustrees`, both the png and pdf files will be saved.
491
532
 
492
533
  Envs:
493
534
  mutaters (type=json): The mutaters to mutate the metadata to subset the cells.
@@ -497,101 +538,41 @@ class SeuratClusterStats(Proc):
497
538
  - res (type=int): The resolution of the plots.
498
539
  - height (type=int): The height of the plots.
499
540
  - width (type=int): The width of the plots.
500
- - prefix: string indicating columns containing clustering information.
541
+ - more_formats (list): The formats to save the plots other than `png`.
542
+ - save_code (flag): Whether to save the code to reproduce the plot.
543
+ - prefix (type=auto): string indicating columns containing clustering information.
501
544
  The trailing dot is not necessary and will be added automatically.
502
- When `_auto`, clustrees will be plotted when there is `FindClusters` or
545
+ When `TRUE`, clustrees will be plotted when there is `FindClusters` or
503
546
  `FindClusters.*` in the `obj@commands`.
504
547
  The latter is generated by `SeuratSubClustering`.
505
- This will be ignored when `envs.clustrees` is specified.
506
- - <more>: Other arguments passed to `clustree::clustree()`.
507
- See <https://rdrr.io/cran/clustree/man/clustree.html>
548
+ This will be ignored when `envs.clustrees` is specified
549
+ (the prefix of each case must be specified separately).
550
+ - <more>: Other arguments passed to `scplotter::ClustreePlot`.
551
+ See <https://pwwang.github.io/scplotter/reference/ClustreePlot.html>
508
552
  clustrees (type=json): The cases for clustree plots.
509
553
  Keys are the names of the plots and values are the dicts inherited from `env.clustrees_defaults` except `prefix`.
510
554
  There is no default case for `clustrees`.
511
- hists_defaults (ns): The default parameters for histograms.
512
- This will plot histograms for the number of cells along `x`.
513
- For example, you can plot the number of cells along cell activity score.
514
- - x: The column name in metadata to plot as the x-axis.
515
- The NA values will be removed.
516
- It could be either numeric or factor/character.
517
- - x_order (list): The order of the x-axis, only works for factor/character `x`.
518
- You can also use it to subset `x` (showing only a subset values of `x`).
519
- - cells_by: A column name in metadata to group the cells.
520
- The NA values will be removed. It should be a factor/character.
521
- if not specified, all cells will be used.
522
- - cells_order (list): The order of the cell groups for the plots.
523
- It should be a list of strings. You can also use `cells_orderby` and `cells_n`
524
- to determine the order.
525
- - cells_orderby: An expression passed to `dplyr::arrange()` to order the cell groups.
526
- - cells_n: The number of cell groups to show.
527
- Ignored if `cells_order` is specified.
528
- - ncol (type=int): The number of columns for the plots, split by `cells_by`.
529
- - subset: An expression to subset the cells, will be passed to `dplyr::filter()`.
530
- - each: Whether to plot each group separately.
531
- - bins: The number of bins to use, only works for numeric `x`.
532
- - plus (list): The extra elements to add to the `ggplot` object.
533
- - devpars (ns): The device parameters for the plots.
534
- - res (type=int): The resolution of the plots.
535
- - height (type=int): The height of the plots.
536
- - width (type=int): The width of the plots.
537
- hists (type=json): The cases for histograms.
538
- Keys are the names of the plots and values are the dicts inherited from `env.hists_defaults`.
539
- There is no default case.
540
555
  stats_defaults (ns): The default parameters for `stats`.
541
- This is to do some basic statistics on the clusters. For more comprehensive analysis,
542
- see `RadarPlots` and `CellsDistribution`.
556
+ This is to do some basic statistics on the clusters/cells. For more comprehensive analysis,
557
+ see <https://pwwang.github.io/scplotter/reference/CellStatPlot.html>.
543
558
  The parameters from the cases can overwrite the default parameters.
544
- - frac (choice): How to calculate the fraction of cells.
545
- - group: calculate the fraction in each group.
546
- The total fraction of the cells of idents in each group will be 1.
547
- When `group-by` is not specified, it will be the same as `all`.
548
- - ident: calculate the fraction in each ident.
549
- The total fraction of the cells of groups in each ident will be 1.
550
- Only works when `group-by` is specified.
551
- - cluster: alias of `ident`.
552
- - all: calculate the fraction against all cells.
553
- - none: do not calculate the fraction, use the number of cells instead.
554
- - pie (flag): Also output a pie chart?
555
- - circos (flag): Also output a circos plot?
556
- - table (flag): Whether to output a table (in tab-delimited format) and in the report.
557
- - transpose (flag): Whether to transpose the cluster and group, that is,
558
- using group as the x-axis and cluster to fill the plot.
559
- For circos plot, when transposed, the arrows will be drawn from the idents (by `ident`) to the
560
- the groups (by `group-by`).
561
- Only works when `group-by` is specified.
562
- - position (choice): The position of the bars. Does not work for pie and circos plots.
563
- - stack: Use `position_stack()`.
564
- - fill: Use `position_fill()`.
565
- - dodge: Use `position_dodge()`.
566
- - auto: Use `stack` when there are more than 5 groups, otherwise use `dodge`.
567
- - ident: The column name in metadata to use as the identity.
568
- - group-by: The column name in metadata to group the cells.
569
- Does NOT support for pie charts.
570
- - split-by: The column name in metadata to split the cells into different plots.
571
- Does NOT support for circos plots.
572
- - subset: An expression to subset the cells, will be passed to
573
- `dplyr::filter()` on metadata.
574
- - circos_labels_rot (flag): Whether to rotate the labels in the circos plot.
575
- In case the labels are too long.
576
- - circos_devpars (ns): The device parameters for the circos plots.
577
- - res (type=int): The resolution of the plots.
578
- - height (type=int): The height of the plots.
579
- - width (type=int): The width of the plots.
580
- - pie_devpars (ns): The device parameters for the pie charts.
581
- - res (type=int): The resolution of the plots.
582
- - height (type=int): The height of the plots.
583
- - width (type=int): The width of the plots.
584
- - devpars (ns): The device parameters for the plots.
559
+ - subset: An expression to subset the cells, will be passed to `tidyrseurat::filter()`.
560
+ - devpars (ns): The device parameters for the plots.
585
561
  - res (type=int): The resolution of the plots.
586
562
  - height (type=int): The height of the plots.
587
563
  - width (type=int): The width of the plots.
564
+ - more_formats (list): The formats to save the plots other than `png`.
565
+ - save_code (flag): Whether to save the code to reproduce the plot.
566
+ - save_data (flag): Whether to save the data used to generate the plot.
567
+ - <more>: Other arguments passed to `scplotter::CellStatPlot`.
568
+ See <https://pwwang.github.io/scplotter/reference/CellStatPlot.html>.
588
569
  stats (type=json): The number/fraction of cells to plot.
589
570
  Keys are the names of the plots and values are the dicts inherited from `env.stats_defaults`.
590
571
  Here are some examples -
591
572
  >>> {
592
573
  >>> "nCells_All": {},
593
- >>> "nCells_Sample": {"group-by": "Sample"},
594
- >>> "fracCells_Sample": {"frac": True, "group-by": "Sample"},
574
+ >>> "nCells_Sample": {"group_by": "Sample"},
575
+ >>> "fracCells_Sample": {"scale_y": True, "group_by": "Sample", "plot_type": "pie"},
595
576
  >>> }
596
577
  ngenes_defaults (ns): The default parameters for `ngenes`.
597
578
  The default parameters to plot the number of genes expressed in each cell.
@@ -610,61 +591,30 @@ class SeuratClusterStats(Proc):
610
591
  - features: The features to plot.
611
592
  It can be either a string with comma separated features, a list of features, a file path with `file://` prefix with features
612
593
  (one per line), or an integer to use the top N features from `VariableFeatures(srtobj)`.
613
- - ident: The column name in metadata to use as the identity.
614
- If it is from subclustering (reduction `sub_umap_<ident>` exists), the reduction will be used.
615
- - cluster_orderby (type=auto): The order of the clusters to show on the plot.
616
- An expression passed to `dplyr::summarise()` on the grouped data frame (by `seurat_clusters`).
617
- The summary stat will be passed to `dplyr::arrange()` to order the clusters. It's applied on the whole meta.data before grouping and subsetting.
594
+ - order_by (type=auto): The order of the clusters to show on the plot.
595
+ An expression passed to `dplyr::arrange()` on the grouped meta data frame (by `ident`).
618
596
  For example, you can order the clusters by the activation score of
619
597
  the cluster: `desc(mean(ActivationScore, na.rm = TRUE))`, suppose you have a column
620
598
  `ActivationScore` in the metadata.
621
- You may also specify the literal order of the clusters by a list of strings.
599
+ You may also specify the literal order of the clusters by a list of strings (at least two).
622
600
  - subset: An expression to subset the cells, will be passed to `tidyrseurat::filter()`.
623
601
  - devpars (ns): The device parameters for the plots. Does not work for `table`.
624
602
  - res (type=int): The resolution of the plots.
625
603
  - height (type=int): The height of the plots.
626
604
  - width (type=int): The width of the plots.
627
- - plus: The extra elements to add to the `ggplot` object. Does not work for `table`.
628
- - group-by: Group cells in different ways (for example, orig.ident). Works for `ridge`, `vln`, and `dot`.
629
- It also works for `feature` as `shape.by` being passed to [`Seurat::FeaturePlot`](https://satijalab.org/seurat/reference/featureplot).
630
- - split-by: The column name in metadata to split the cells into different plots.
631
- It works for `vln`, `feature`, and `dot`.
632
- - assay: The assay to use.
633
- - layer: The layer to use.
634
- - reduction: The reduction to use. Only works for `feature`.
635
- - section: The section to put the plot in the report.
636
- If not specified, the case title will be used.
637
- - ncol (type=int): The number of columns for the plots.
638
- - kind (choice): The kind of the plot or table.
639
- - ridge: Use `Seurat::RidgePlot`.
640
- - ridgeplot: Same as `ridge`.
641
- - vln: Use `Seurat::VlnPlot`.
642
- - vlnplot: Same as `vln`.
643
- - violin: Same as `vln`.
644
- - violinplot: Same as `vln`.
645
- - feature: Use `Seurat::FeaturePlot`.
646
- - featureplot: Same as `feature`.
647
- - dot: Use `Seurat::DotPlot`.
648
- - dotplot: Same as `dot`.
649
- - bar: Bar plot on an aggregated feature.
650
- The features must be a single feature, which will be either an existing feature or an expression
651
- passed to `dplyr::summarise()` (grouped by `ident`) on the existing features to create a new feature.
652
- - barplot: Same as `bar`.
653
- - heatmap: Use `Seurat::DoHeatmap`.
654
- - avgheatmap: Plot the average expression of the features in each cluster as a heatmap.
655
- - table: The table for the features, only gene expressions are supported.
656
- (supported keys: ident, subset, and features).
605
+ - descr: The description of the plot, showing in the report.
606
+ - more_formats (list): The formats to save the plots other than `png`.
607
+ - save_code (flag): Whether to save the code to reproduce the plot.
608
+ - save_data (flag): Whether to save the data used to generate the plot.
609
+ - <more>: Other arguments passed to `scplotter::FeatureStatPlot`.
610
+ See <https://pwwang.github.io/scplotter/reference/FeatureStatPlot.html>
657
611
  features (type=json): The plots for features, include gene expressions, and columns from metadata.
658
- Keys are the titles of the cases and values are the dicts inherited from `env.features_defaults`. It can also have other parameters from
659
- each Seurat function used by `kind`. Note that for argument name with `.`, you should use `-` instead.
612
+ Keys are the titles of the cases and values are the dicts inherited from `env.features_defaults`.
660
613
  dimplots_defaults (ns): The default parameters for `dimplots`.
661
- - ident: The identity to use.
614
+ - group_by: The identity to use.
662
615
  If it is from subclustering (reduction `sub_umap_<ident>` exists), this reduction will be used if `reduction`
663
616
  is set to `dim` or `auto`.
664
- - group-by: Same as `ident` if not specified, to define how the points are colored.
665
- - na_group: The group name for NA values, use `None` to ignore NA values.
666
- - split-by: The column name in metadata to split the cells into different plots.
667
- - shape-by: The column name in metadata to use as the shape.
617
+ - split_by: The column name in metadata to split the cells into different plots.
668
618
  - subset: An expression to subset the cells, will be passed to `tidyrseurat::filter()`.
669
619
  - devpars (ns): The device parameters for the plots.
670
620
  - res (type=int): The resolution of the plots.
@@ -678,64 +628,42 @@ class SeuratClusterStats(Proc):
678
628
  - umap: Use `Seurat::UMAPPlot`.
679
629
  - tsne: Use `Seurat::TSNEPlot`.
680
630
  - pca: Use `Seurat::PCAPlot`.
681
- - <more>: See <https://satijalab.org/seurat/reference/dimplot>
631
+ - <more>: See <https://pwwang.github.io/scplotter/reference/CellDimPlot.html>
682
632
  dimplots (type=json): The dimensional reduction plots.
683
633
  Keys are the titles of the plots and values are the dicts inherited from `env.dimplots_defaults`. It can also have other parameters from
684
- [`Seurat::DimPlot`](https://satijalab.org/seurat/reference/dimplot).
634
+ [`scplotter::CellDimPlot`](https://pwwang.github.io/scplotter/reference/CellDimPlot.html).
685
635
 
686
636
  Requires:
687
637
  r-seurat:
688
638
  - check: {{proc.lang}} -e "library(Seurat)"
689
639
  """ # noqa: E501
640
+
690
641
  input = "srtobj:file"
691
642
  output = "outdir:dir:{{in.srtobj | stem}}.cluster_stats"
692
643
  lang = config.lang.rscript
693
644
  envs = {
694
645
  "mutaters": {},
695
646
  "clustrees_defaults": {
696
- "devpars": {"res": 100, "height": 1000, "width": 800},
697
- "prefix": "_auto",
647
+ "devpars": {"res": 100},
648
+ "more_formats": [],
649
+ "save_code": False,
650
+ "prefix": True,
698
651
  },
699
652
  "clustrees": {},
700
- "hists_defaults": {
701
- "x": None,
702
- "x_order": [],
703
- "cells_by": None,
704
- "cells_order": [],
705
- "cells_orderby": None,
706
- "cells_n": 10,
707
- "subset": None,
708
- "ncol": 2,
709
- "each": None,
710
- "bins": 30,
711
- "plus": [],
712
- "devpars": {"res": 100, "height": None, "width": None},
713
- },
714
- "hists": {},
715
653
  "stats_defaults": {
716
- "frac": "none",
717
- "pie": False,
718
- "circos": False,
719
- "table": False,
720
- "position": "auto",
721
- "transpose": False,
722
- "ident": "seurat_clusters",
723
- "group-by": None,
724
- "split-by": None,
725
654
  "subset": None,
726
- "circos_labels_rot": False,
727
- "devpars": {"res": 100, "height": 600, "width": 800},
728
- "pie_devpars": {"res": 100, "height": 600, "width": 800},
729
- "circos_devpars": {"res": 100, "height": 600, "width": 600},
655
+ "devpars": {"res": 100},
656
+ "more_formats": [],
657
+ "save_code": False,
658
+ "save_data": False,
730
659
  },
731
660
  "stats": {
732
- "Number of cells in each cluster": {
733
- "pie": True,
661
+ "Number of cells in each cluster (Bar Chart)": {
662
+ "plot_type": "bar",
734
663
  },
735
- "Number of cells in each cluster by Sample": {
736
- "group-by": "Sample",
737
- "table": True,
738
- "frac": "group",
664
+ "Number of cells in each cluster by Sample (Bar Chart)": {
665
+ "plot_type": "bar",
666
+ "group_by": "Sample",
739
667
  },
740
668
  },
741
669
  "ngenes_defaults": {
@@ -750,43 +678,31 @@ class SeuratClusterStats(Proc):
750
678
  },
751
679
  "features_defaults": {
752
680
  "features": None,
753
- "ident": "seurat_clusters",
754
- "cluster_orderby": None,
681
+ "order_by": None,
755
682
  "subset": None,
756
683
  "devpars": {"res": 100},
757
- "plus": None,
758
- "group-by": None,
759
- "split-by": None,
760
- "assay": None,
761
- "section": None,
762
- "layer": None,
763
- "reduction": None,
764
- "kind": None,
765
- "ncol": 2,
684
+ "descr": None,
685
+ "more_formats": [],
686
+ "save_code": False,
687
+ "save_data": False,
766
688
  },
767
689
  "features": {},
768
690
  "dimplots_defaults": {
769
- "ident": "seurat_clusters",
770
- "group-by": None,
771
- "na_group": None,
772
- "split-by": None,
773
- "shape-by": None,
691
+ "group_by": "seurat_clusters",
692
+ "split_by": None,
774
693
  "subset": None,
775
694
  "reduction": "dim",
776
- "devpars": {"res": 100, "height": 800, "width": 1000},
695
+ "devpars": {"res": 100},
777
696
  },
778
697
  "dimplots": {
779
698
  "Dimensional reduction plot": {
780
699
  "label": True,
781
- "label-box": True,
782
- "repel": True,
700
+ "label_insitu": True,
783
701
  },
784
702
  },
785
703
  }
786
704
  script = "file://../scripts/scrna/SeuratClusterStats.R"
787
- plugin_opts = {
788
- "report": "file://../reports/scrna/SeuratClusterStats.svelte"
789
- }
705
+ plugin_opts = {"report": "file://../reports/scrna/SeuratClusterStats.svelte"}
790
706
 
791
707
 
792
708
  class ModuleScoreCalculator(Proc):
@@ -806,7 +722,7 @@ class ModuleScoreCalculator(Proc):
806
722
  srtobj: The seurat object loaded by `SeuratClustering`
807
723
 
808
724
  Output:
809
- rdsfile: The seurat object with module scores
725
+ rdsfile: The seurat object with module scores added to the metadata.
810
726
 
811
727
  Envs:
812
728
  defaults (ns): The default parameters for `modules`.
@@ -863,6 +779,7 @@ class ModuleScoreCalculator(Proc):
863
779
  This requires [`SingleCellExperiment`](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html)
864
780
  and [`destiny`](https://bioconductor.org/packages/release/bioc/html/destiny.html) R packages.
865
781
  """ # noqa: E501
782
+
866
783
  input = "srtobj:file"
867
784
  output = "rdsfile:file:{{in.srtobj | stem}}.RDS"
868
785
  lang = config.lang.rscript
@@ -922,7 +839,8 @@ class CellsDistribution(Proc):
922
839
  srtobj: The seurat object in RDS format
923
840
 
924
841
  Output:
925
- outdir: The output directory
842
+ outdir: The output directory.
843
+ The results for each case will be saved in a subdirectory.
926
844
 
927
845
  Envs:
928
846
  mutaters (type=json): The mutaters to mutate the metadata
@@ -988,6 +906,7 @@ class CellsDistribution(Proc):
988
906
  r-tidyr:
989
907
  - check: {{proc.lang}} -e "library(tidyr)"
990
908
  """ # noqa: E501
909
+
991
910
  input = "srtobj:file"
992
911
  output = "outdir:dir:{{in.srtobj | stem}}.cells_distribution"
993
912
  lang = config.lang.rscript
@@ -1043,6 +962,7 @@ class SeuratMetadataMutater(Proc):
1043
962
  r-dplyr:
1044
963
  - check: {{proc.lang}} <(echo "library(dplyr)")
1045
964
  """ # noqa: E501
965
+
1046
966
  input = "srtobj:file, metafile:file"
1047
967
  output = "rdsfile:file:{{in.srtobj | stem}}.RDS"
1048
968
  lang = config.lang.rscript
@@ -1067,6 +987,7 @@ class DimPlots(Proc):
1067
987
  Keys are the names and values are the arguments to
1068
988
  `Seurat::Dimplots`
1069
989
  """
990
+
1070
991
  input = "srtobj:file, configfile:file, name:var"
1071
992
  output = "outdir:dir:{{in.srtobj | stem}}.dimplots"
1072
993
  lang = config.lang.rscript
@@ -1080,7 +1001,6 @@ class DimPlots(Proc):
1080
1001
 
1081
1002
  @format_placeholder(
1082
1003
  mutate_helpers_clonesize=MUTATE_HELPERS_CLONESIZE_INDENTED,
1083
- envs_section_each=ENVS_SECTION_EACH_INDENTED,
1084
1004
  )
1085
1005
  class MarkersFinder(Proc):
1086
1006
  """Find markers between different groups of cells
@@ -1099,7 +1019,7 @@ class MarkersFinder(Proc):
1099
1019
  by `PrepSCTFindMarkers` if data is not normalized using `SCTransform`.
1100
1020
 
1101
1021
  Output:
1102
- outdir: The output directory for the markers
1022
+ outdir: The output directory for the markers and plots
1103
1023
 
1104
1024
  Envs:
1105
1025
  ncores (type=int): Number of cores to use for parallel computing for some `Seurat` procedures.
@@ -1131,73 +1051,104 @@ class MarkersFinder(Proc):
1131
1051
  to select markers with adjusted p-value < 0.05 and absolute log2
1132
1052
  fold change > 1.
1133
1053
  assay: The assay to use.
1134
- volcano_genes (type=auto): The genes to label in the volcano plot if they are
1135
- significant markers.
1136
- If `True`, all significant markers will be labeled. If `False`, no
1137
- genes will be labeled. Otherwise, specify the genes to label.
1138
- It could be either a string with comma separated genes, or a list
1139
- of genes.
1140
- section: The section name for the report. It must not contain colon (`:`).
1141
- Ignored when `each` is not specified and `ident-1` is specified.
1142
- When neither `each` nor `ident-1` is specified, case name will be used
1143
- as section name.
1144
- If `each` is specified, the section name will be constructed from
1145
- `each` and case name.
1146
- %(envs_section_each)s
1054
+ error (flag): Error out if no/not enough markers are found or no pathways are enriched.
1055
+ If `False`, empty results will be returned.
1056
+ site: The site to use for the `enrichR` enrichment analysis.
1147
1057
  subset: An expression to subset the cells for each case.
1058
+ cache (type=auto): Where to cache the `FindAllMarkers` results.
1059
+ If `True`, cache to `outdir` of the job. If `False`, don't cache.
1060
+ Otherwise, specify the directory to cache to.
1148
1061
  rest (ns): Rest arguments for `Seurat::FindMarkers()`.
1149
1062
  Use `-` to replace `.` in the argument name. For example,
1150
1063
  use `min-pct` instead of `min.pct`.
1151
1064
  This only works when `use_presto` is `False`.
1152
1065
  - <more>: See <https://satijalab.org/seurat/reference/findmarkers>
1153
- dotplot (ns): Arguments for `Seurat::DotPlot()`.
1154
- Use `-` to replace `.` in the argument name. For example,
1155
- use `group-bar` instead of `group.bar`.
1156
- Note that `object`, `features`, and `group-by` are already specified
1157
- by this process. So you don't need to specify them here.
1158
- - maxgenes (type=int): The maximum number of genes to plot.
1066
+ allmarker_plots_defaults (ns): Default options for the plots for all markers when `ident-1` is not specified.
1067
+ - plot_type: The type of the plot.
1068
+ See <https://pwwang.github.io/scplotter/reference/FeatureStatPlot.html>.
1069
+ Available types are `violin`, `box`, `bar`, `ridge`, `dim`, `heatmap` and `dot`.
1070
+ - more_formats (list): The extra formats to save the plot in.
1071
+ - save_code (flag): Whether to save the code to generate the plot.
1159
1072
  - devpars (ns): The device parameters for the plots.
1160
1073
  - res (type=int): The resolution of the plots.
1161
1074
  - height (type=int): The height of the plots.
1162
1075
  - width (type=int): The width of the plots.
1163
- - <more>: See <https://satijalab.org/seurat/reference/doheatmap>
1164
- cases (type=json): If you have multiple cases, you can specify them
1165
- here. The keys are the names of the cases and the values are the
1166
- above options except `ncores` and `mutaters`. If some options are
1167
- not specified, the default values specified above will be used.
1168
- If no cases are specified, the default case will be added with
1169
- the default values under `envs` with the name `DEFAULT`.
1170
- overlap_defaults (ns): The default options for overlapping analysis.
1076
+ - order_by: An expression to order the markers, passed to `dplyr::arrange()`.
1077
+ - genes: The number of top genes to show or an expression passed to `dplyr::filter()` to filter the genes.
1078
+ - <more>: Other arguments passed to [`scplotter::FeatureStatPlot()`](https://pwwang.github.io/scplotter/reference/FeatureStatPlot.html).
1079
+ allmarker_plots (type=json): All marker plot cases.
1080
+ The keys are the names of the cases and the values are the dicts inherited from `allmarker_plots_defaults`.
1081
+ marker_plots_defaults (ns): Default options for the plots to generate for the markers.
1082
+ - plot_type: The type of the plot.
1083
+ See <https://pwwang.github.io/scplotter/reference/FeatureStatPlot.html>.
1084
+ Available types are `violin`, `box`, `bar`, `ridge`, `dim`, `heatmap` and `dot`.
1085
+ There are two additional types available - `volcano_pct` and `volcano_log2fc`.
1086
+ - more_formats (list): The extra formats to save the plot in.
1087
+ - save_code (flag): Whether to save the code to generate the plot.
1088
+ - devpars (ns): The device parameters for the plots.
1089
+ - res (type=int): The resolution of the plots.
1090
+ - height (type=int): The height of the plots.
1091
+ - width (type=int): The width of the plots.
1092
+ - order_by: An expression to order the markers, passed to `dplyr::arrange()`.
1093
+ - genes: The number of top genes to show or an expression passed to `dplyr::filter()` to filter the genes.
1094
+ - <more>: Other arguments passed to [`scplotter::FeatureStatPlot()`](https://pwwang.github.io/scplotter/reference/FeatureStatPlot.html).
1095
+ If `plot_type` is `volcano_pct` or `volcano_log2fc`, they will be passed to
1096
+ [`scplotter::VolcanoPlot()`](https://pwwang.github.io/plotthis/reference/VolcanoPlot.html).
1097
+ marker_plots (type=json): Cases of the plots to generate for the markers.
1098
+ Plot cases. The keys are the names of the cases and the values are the dicts inherited from `marker_plots_defaults`.
1099
+ enrich_plots_defaults (ns): Default options for the plots to generate for the enrichment analysis.
1100
+ - plot_type: The type of the plot.
1101
+ See <https://pwwang.github.io/scplotter/reference/EnrichmentPlot.html>.
1102
+ Available types are `bar`, `dot`, `lollipop`, `network`, `enrichmap` and `wordcloud`.
1103
+ - more_formats (list): The extra formats to save the plot in.
1104
+ - save_code (flag): Whether to save the code to generate the plot.
1105
+ - devpars (ns): The device parameters for the plots.
1106
+ - res (type=int): The resolution of the plots.
1107
+ - height (type=int): The height of the plots.
1108
+ - width (type=int): The width of the plots.
1109
+ - <more>: See <https://pwwang.github.io/scplotter/reference/EnrichmentPlot.html>.
1110
+ enrich_plots (type=json): Cases of the plots to generate for the enrichment analysis.
1111
+ The keys are the names of the cases and the values are the dicts inherited from `enrich_plots_defaults`.
1112
+ cases (type=json): If you have multiple cases for marker discovery, you can specify them
1113
+ here. The keys are the names of the cases and the values are the above options. If some options are
1114
+ not specified, the default values specified above (under `envs`) will be used.
1115
+ If no cases are specified, the default case will be added with the default values under `envs` with the name `DEFAULT`.
1116
+ If you want to put some cases under the same section in the report, you can specify the section name in the case name
1117
+ as a prefix separated by `::`. For example, `section1::case1` and `section1::case2` will put `case1` and `case2`
1118
+ under the section `section1`.
1119
+ overlaps_defaults (ns): Default options for investigating the overlapping of significant markers between different cases.
1120
+ - cases (list): The cases to do the overlapping analysis, including the prefix section name.
1121
+ The case must have `ident-1` specified. When `each` is specified, the case will be expanded.
1122
+ For example, `case1` with `each = "group"`, where `group` has `g1` and `g2`, will be expanded to
1123
+ `case1::g1` and `case1::g2`, or `case1::group - g1` and `case1::group - g2` if `prefix_each` is `True`.
1124
+ There must be at least 2 cases to do the overlapping analysis.
1125
+ - sigmarkers: The expression to filter the significant markers for each case.
1126
+ If not provided, `envs.sigmarkers` will be used.
1171
1127
  - venn (ns): The options for the Venn diagram.
1172
- Venn diagram can only be plotted for sections with no more than 4 cases.
1128
+ - enabled (flag): Whether to enable the Venn diagram.
1129
+ Default is "auto", which means enabled when there are no more than 5 cases.
1130
+ - more_formats (list): The extra formats to save the plot in.
1131
+ - save_code (flag): Whether to save the code to generate the plot.
1173
1132
  - devpars (ns): The device parameters for the plots.
1174
1133
  - res (type=int): The resolution of the plots.
1175
1134
  - height (type=int): The height of the plots.
1176
1135
  - width (type=int): The width of the plots.
1136
+ - <more>: More arguments passed to `plotthis::VennDiagram()`.
1137
+ https://pwwang.github.io/plotthis/reference/venndiagram1.html
1177
1138
  - upset (ns): The options for the UpSet plot.
1139
+ - enabled (flag): Whether to enable the UpSet plot.
1140
+ - more_formats (list): The extra formats to save the plot in.
1141
+ - save_code (flag): Whether to save the code to generate the plot.
1178
1142
  - devpars (ns): The device parameters for the plots.
1179
1143
  - res (type=int): The resolution of the plots.
1180
1144
  - height (type=int): The height of the plots.
1181
1145
  - width (type=int): The width of the plots.
1182
- overlap (json): The sections to do overlaping analysis, including
1183
- Venn diagram and UpSet plot. The Venn diagram and UpSet plot
1184
- will be plotted for the overlapping of significant markers between
1185
- different cases.
1186
- The keys of this option are the names of the sections. The values are
1187
- a dict of options with keys `venn` and `upset`, values will
1188
- be inherited from `envs.overlap_defaults`, recursively.
1189
- You can set `envs.overlap.<section>.venn` to `False`/`None` to disable
1190
- the Venn diagram for the section.
1191
- It works when `each` is specified. In such a case, the sections will be
1192
- the case names.
1193
- This does not work for the cases where `ident-1` is not specified. In case
1194
- you want to do such analysis for those cases, you should enumerate the
1195
- idents in different cases and specify them here.
1196
- cache (type=auto): Where to cache to `FindAllMarkers` results.
1197
- If `True`, cache to `outdir` of the job. If `False`, don't cache.
1198
- Otherwise, specify the directory to cache to.
1199
- Only works when `use_presto` is `False` (presto works fast enough).
1146
+ - <more>: More arguments passed to `plotthis::UpsetPlot()`.
1147
+ https://pwwang.github.io/plotthis/reference/upsetplot1.html
1148
+ overlaps (type=json): Cases for investigating the overlapping of significant markers between different cases.
1149
+ The keys are the names of the cases and the values are the dicts inherited from `overlaps_defaults`.
1200
1150
  """ # noqa: E501
1151
+
1201
1152
  input = "srtobj:file"
1202
1153
  output = "outdir:dir:{{in.srtobj | stem0}}.markers"
1203
1154
  lang = config.lang.rscript
@@ -1210,21 +1161,62 @@ class MarkersFinder(Proc):
1210
1161
  "each": None,
1211
1162
  "prefix_each": True,
1212
1163
  "prefix_group": True,
1213
- "section": "DEFAULT",
1214
1164
  "assay": None,
1215
1165
  "subset": None,
1166
+ "error": True,
1167
+ "site": "Enrichr",
1216
1168
  "rest": {},
1217
1169
  "dbs": ["KEGG_2021_Human", "MSigDB_Hallmark_2020"],
1218
1170
  "sigmarkers": "p_val_adj < 0.05",
1219
- "volcano_genes": True,
1220
- "dotplot": {"maxgenes": 20},
1171
+ "cache": config.path.tmpdir,
1172
+ "allmarker_plots_defaults": {
1173
+ "plot_type": None,
1174
+ "more_formats": [],
1175
+ "save_code": False,
1176
+ "devpars": {"res": 100},
1177
+ "order_by": "desc(abs(avg_log2FC))",
1178
+ "genes": 10,
1179
+ },
1180
+ "allmarker_plots": {},
1181
+ "marker_plots_defaults": {
1182
+ "plot_type": None,
1183
+ "more_formats": [],
1184
+ "save_code": False,
1185
+ "devpars": {"res": 100},
1186
+ "order_by": "desc(abs(avg_log2FC))",
1187
+ "genes": 10,
1188
+ },
1189
+ "marker_plots": {
1190
+ "Volcano Plot (diff_pct)": {"plot_type": "volcano_pct"},
1191
+ "Volcano Plot (log2FC)": {"plot_type": "volcano_log2fc"},
1192
+ "Dot Plot": {"plot_type": "dot"},
1193
+ },
1194
+ "enrich_plots_defaults": {
1195
+ "more_formats": [],
1196
+ "save_code": False,
1197
+ "devpars": {"res": 100},
1198
+ },
1199
+ "enrich_plots": {
1200
+ "Bar Plot": {"plot_type": "bar", "ncol": 1, "top_term": 10},
1201
+ },
1221
1202
  "cases": {},
1222
- "overlap_defaults": {
1223
- "venn": {"devpars": {"res": 100, "height": 600, "width": 1000}},
1224
- "upset": {"devpars": {"res": 100, "height": 600, "width": 800}},
1203
+ "overlaps_defaults": {
1204
+ "cases": [],
1205
+ "sigmarkers": None,
1206
+ "venn": {
1207
+ "enabled": "auto",
1208
+ "more_formats": [],
1209
+ "save_code": False,
1210
+ "devpars": {"res": 100},
1211
+ },
1212
+ "upset": {
1213
+ "enabled": True,
1214
+ "more_formats": [],
1215
+ "save_code": False,
1216
+ "devpars": {"res": 100},
1217
+ },
1225
1218
  },
1226
- "overlap": {},
1227
- "cache": config.path.tmpdir,
1219
+ "overlaps": {},
1228
1220
  }
1229
1221
  order = 5
1230
1222
  script = "file://../scripts/scrna/MarkersFinder.R"
@@ -1274,6 +1266,7 @@ class TopExpressingGenes(Proc):
1274
1266
  If no cases are specified, the default case will be added with
1275
1267
  the default values under `envs` with the name `DEFAULT`.
1276
1268
  """
1269
+
1277
1270
  input = "srtobj:file"
1278
1271
  output = "outdir:dir:{{in.srtobj | stem}}.top_expressing_genes"
1279
1272
  lang = config.lang.rscript
@@ -1358,6 +1351,7 @@ class ExprImputation(Proc):
1358
1351
  - if: {{proc.envs.tool == "alra"}}
1359
1352
  - check: {{proc.lang}} <(echo "library(SeuratWrappers)")
1360
1353
  """ # noqa: E501
1354
+
1361
1355
  input = "infile:file"
1362
1356
  output = "outfile:file:{{in.infile | stem}}.imputed.RDS"
1363
1357
  lang = config.lang.rscript
@@ -1393,10 +1387,10 @@ class SCImpute(Proc):
1393
1387
  infmt: The input format.
1394
1388
  Either `seurat` or `matrix`
1395
1389
  """
1390
+
1396
1391
  input = "infile:file, groupfile:file"
1397
1392
  output = [
1398
- "outfile:file:{{in.infile | stem | replace: '.seurat', ''}}."
1399
- "{{envs.outfmt}}"
1393
+ "outfile:file:{{in.infile | stem | replace: '.seurat', ''}}." "{{envs.outfmt}}"
1400
1394
  ]
1401
1395
  lang = config.lang.rscript
1402
1396
  envs = {
@@ -1434,6 +1428,7 @@ class SeuratFilter(Proc):
1434
1428
  r-dplyr:
1435
1429
  - check: {{proc.lang}} <(echo "library('dplyr')")
1436
1430
  """
1431
+
1437
1432
  input = "srtobj:file, filters:var"
1438
1433
  output = "outfile:file:{{in.srtobj | stem}}.filtered.RDS"
1439
1434
  lang = config.lang.rscript
@@ -1468,6 +1463,7 @@ class SeuratSubset(Proc):
1468
1463
  r-dplyr:
1469
1464
  - check: {{proc.lang}} <(echo "library('dplyr')")
1470
1465
  """
1466
+
1471
1467
  input = "srtobj:file, subsets:var"
1472
1468
  output = "outdir:dir:{{in.srtobj | stem}}.subsets"
1473
1469
  envs = {"ignore_nas": True}
@@ -1491,6 +1487,7 @@ class SeuratSplit(Proc):
1491
1487
  recell: Rename the cell ids using the `by` column
1492
1488
  A string of R function taking the original cell ids and `by`
1493
1489
  """
1490
+
1494
1491
  input = "srtobj:file, by:var"
1495
1492
  output = "outdir:dir:{{in.srtobj | stem}}.subsets"
1496
1493
  envs = {
@@ -1521,6 +1518,7 @@ class Subset10X(Proc):
1521
1518
  feats_to_keep: The features/genes to keep.
1522
1519
  The final features list will be `feats_to_keep` + `nfeats`
1523
1520
  """
1521
+
1524
1522
  input = "indir:dir"
1525
1523
  output = "outdir:dir:{{in.indir | stem}}"
1526
1524
  envs = {
@@ -1550,6 +1548,7 @@ class SeuratTo10X(Proc):
1550
1548
  Envs:
1551
1549
  version: The version of 10X format
1552
1550
  """
1551
+
1553
1552
  input = "srtobj:file"
1554
1553
  output = "outdir:dir:{{in.srtobj | stem}}"
1555
1554
  envs = {"version": "3", "split_by": None}
@@ -1582,7 +1581,7 @@ class ScFGSEA(Proc):
1582
1581
  srtobj: The seurat object in RDS format
1583
1582
 
1584
1583
  Output:
1585
- outdir: The output directory for the results
1584
+ outdir: The output directory for the results and plots
1586
1585
 
1587
1586
  Envs:
1588
1587
  ncores (type=int): Number of cores for parallelization
@@ -1638,6 +1637,7 @@ class ScFGSEA(Proc):
1638
1637
  r-seurat:
1639
1638
  - check: {{proc.lang}} -e "library(seurat)"
1640
1639
  """ # noqa: E501
1640
+
1641
1641
  input = "srtobj:file"
1642
1642
  output = "outdir:dir:{{(in.casefile or in.srtobj) | stem0}}.fgsea"
1643
1643
  lang = config.lang.rscript
@@ -1704,7 +1704,9 @@ class CellTypeAnnotation(Proc):
1704
1704
  sobjfile: The seurat object
1705
1705
 
1706
1706
  Output:
1707
- outfile: The rds file of seurat object with cell type annotated
1707
+ outfile: The rds file of seurat object with cell type annotated.
1708
+ A text file containing the mapping from the old `seurat_clusters` to the new cell types
1709
+ will be generated and saved to `cluster2celltype.tsv` under the job output directory.
1708
1710
 
1709
1711
  Envs:
1710
1712
  tool (choice): The tool to use for cell type annotation.
@@ -1788,6 +1790,7 @@ class CellTypeAnnotation(Proc):
1788
1790
  - if: {{proc.envs.tool == 'sctype'}}
1789
1791
  - check: {{proc.lang}} -e "library(openxlsx)"
1790
1792
  """ # noqa: E501
1793
+
1791
1794
  input = "sobjfile:file"
1792
1795
  output = (
1793
1796
  "outfile:file:"
@@ -1905,6 +1908,7 @@ class SeuratMap2Ref(Proc):
1905
1908
  r-seurat:
1906
1909
  - check: {{proc.lang}} -e "library(Seurat)"
1907
1910
  """ # noqa: E501
1911
+
1908
1912
  input = "sobjfile:file"
1909
1913
  output = "outfile:file:{{in.sobjfile | stem}}.RDS"
1910
1914
  lang = config.lang.rscript
@@ -1935,7 +1939,7 @@ class SeuratMap2Ref(Proc):
1935
1939
  # "celltype-l1": "celltype.l1",
1936
1940
  # "celltype-l2": "celltype.l2",
1937
1941
  # "predicted_ADT": "ADT",
1938
- }
1942
+ },
1939
1943
  },
1940
1944
  "MappingScore": {"ndim": 30},
1941
1945
  }
@@ -2083,6 +2087,7 @@ class RadarPlots(Proc):
2083
2087
  key `DEFAULT`.
2084
2088
  The keys must be valid string as part of the file name.
2085
2089
  """ # noqa: E501
2090
+
2086
2091
  input = "srtobj:file"
2087
2092
  output = "outdir:dir:{{in.srtobj | stem}}.radar_plots"
2088
2093
  lang = config.lang.rscript
@@ -2093,7 +2098,7 @@ class RadarPlots(Proc):
2093
2098
  "each": None,
2094
2099
  "prefix_each": True,
2095
2100
  "order": None,
2096
- "colors": None,
2101
+ "colors": "biopipen",
2097
2102
  "ident": "seurat_clusters",
2098
2103
  "cluster_order": [],
2099
2104
  "breakdown": None,
@@ -2193,6 +2198,7 @@ class MetaMarkers(Proc):
2193
2198
  If no cases are specified, the default case will be added with
2194
2199
  the default values under `envs` with the name `DEFAULT`.
2195
2200
  """ # noqa: E501
2201
+
2196
2202
  input = "srtobj:file"
2197
2203
  output = "outdir:dir:{{in.srtobj | stem}}.meta_markers"
2198
2204
  lang = config.lang.rscript
@@ -2231,6 +2237,7 @@ class Seurat2AnnData(Proc):
2231
2237
  assay: The assay to use for AnnData.
2232
2238
  If not specified, the default assay will be used.
2233
2239
  """
2240
+
2234
2241
  input = "sobjfile:file"
2235
2242
  output = "outfile:file:{{in.sobjfile | stem}}.h5ad"
2236
2243
  lang = config.lang.rscript
@@ -2260,6 +2267,7 @@ class AnnData2Seurat(Proc):
2260
2267
  to use for the check.
2261
2268
  Only works for `outtype = 'rds'`.
2262
2269
  """
2270
+
2263
2271
  input = "adfile:file"
2264
2272
  output = "outfile:file:{{in.adfile | stem}}.RDS"
2265
2273
  lang = config.lang.rscript
@@ -2302,6 +2310,7 @@ class ScSimulation(Proc):
2302
2310
  See <https://rdrr.io/bioc/splatter/man/SplatParams.html>.
2303
2311
  Hyphens (`-`) will be transformed into dots (`.`) for the keys.
2304
2312
  """ # noqa: E501
2313
+
2305
2314
  input = "seed:var"
2306
2315
  output = "outfile:file:simulatied_{{in.seed}}.RDS"
2307
2316
  lang = config.lang.rscript
@@ -2348,7 +2357,7 @@ class CellCellCommunication(Proc):
2348
2357
  expression, while *_complex corresponds to the actual complex, with subunits being separated by _.
2349
2358
  source and target columns represent the source/sender and target/receiver cell identity for each interaction, respectively
2350
2359
  * `*_props`: represents the proportion of cells that express the entity.
2351
- By default, any interactions in which either entity is not expressed in above 10% of cells per cell type
2360
+ By default, any interactions in which either entity is not expressed in above 10%% of cells per cell type
2352
2361
  is considered as a false positive, under the assumption that since CCC occurs between cell types, a sufficient
2353
2362
  proportion of cells within should express the genes.
2354
2363
  * `*_means`: entity expression mean per cell type.
@@ -2376,6 +2385,21 @@ class CellCellCommunication(Proc):
2376
2385
  - geometric_mean: alias for `Geometric_Mean`
2377
2386
  - scseqcomm: alias for `scSeqComm`
2378
2387
  - cellchat: alias for `CellChat`
2388
+ subset: An expression in string to subset the cells.
2389
+ When a `.rds` or `.h5seurat` file is provided for `in.sobjfile`, you can provide an expression in `R`,
2390
+ which will be passed to `base::subset()` in `R` to subset the cells.
2391
+ But you can always pass an expression in `python` to subset the cells.
2392
+ See <https://anndata.readthedocs.io/en/latest/tutorials/notebooks/getting-started.html#subsetting-using-metadata>.
2393
+ You should use `adata` to refer to the AnnData object. For example, `adata.obs.groups == "g1"` will subset the cells
2394
+ with `groups` equal to `g1`.
2395
+ subset_using: The method to subset the cells.
2396
+ - auto: Automatically detect the method to use.
2397
+ Note that this is not always accurate. We simply check if `[` is in the expression.
2398
+ If so, we use `python` to subset the cells; otherwise, we use `R`.
2399
+ - python: Use python to subset the cells.
2400
+ - r: Use R to subset the cells.
2401
+ split_by: The column name in metadata to split the cells to run the method separately.
2402
+ The results will be combined together with this column in the final output.
2379
2403
  assay: The assay to use for the analysis.
2380
2404
  Only works for Seurat object.
2381
2405
  seed (type=int): The seed for the random number generator.
@@ -2401,6 +2425,7 @@ class CellCellCommunication(Proc):
2401
2425
  See the method documentation for more details and also
2402
2426
  `help(liana.mt.<method>.__call__)` in Python.
2403
2427
  """ # noqa: E501
2428
+
2404
2429
  input = "sobjfile:file"
2405
2430
  output = "outfile:file:{{in.sobjfile | stem}}-ccc.txt"
2406
2431
  lang = config.lang.python
@@ -2408,6 +2433,9 @@ class CellCellCommunication(Proc):
2408
2433
  "method": "cellchat",
2409
2434
  "assay": None,
2410
2435
  "seed": 1337,
2436
+ "subset": None,
2437
+ "subset_using": "auto",
2438
+ "split_by": None,
2411
2439
  "ncores": config.misc.ncores,
2412
2440
  "groupby": "seurat_clusters",
2413
2441
  "species": "human",
@@ -2455,6 +2483,7 @@ class CellCellCommunicationPlots(Proc):
2455
2483
  See the documentation for more details.
2456
2484
  Or you can use `?CCPlotR::cc_<kind>` in R.
2457
2485
  """
2486
+
2458
2487
  input = "cccfile:file, expfile:file"
2459
2488
  output = "outdir:dir:{{in.cccfile | stem}}-ccc_plots"
2460
2489
  lang = config.lang.rscript
@@ -2467,3 +2496,135 @@ class CellCellCommunicationPlots(Proc):
2467
2496
  plugin_opts = {
2468
2497
  "report": "file://../reports/scrna/CellCellCommunicationPlots.svelte",
2469
2498
  }
2499
+
2500
+
2501
+ class ScVelo(Proc):
2502
+ """Velocity analysis for single-cell RNA-seq data
2503
+
2504
+ This process is implemented based on the Python package `scvelo`.
2505
+
2506
+ Input:
2507
+ sobjfile: The seurat object file in RDS or h5seurat format or AnnData file.
2508
+
2509
+ Output:
2510
+ outfile: The output object with the velocity embeddings and information.
2511
+ In either RDS, h5seurat or h5ad format, depending on the `envs.outtype`.
2512
+ outdir: The output directory for the plots
2513
+
2514
+ Envs:
2515
+ ncores (type=int): Number of cores to use.
2516
+ group_by: The column name in metadata to group the cells.
2517
+ Typically, this column should be the cluster id.
2518
+ reduction: The nonlinear reduction to use for the velocity analysis.
2519
+ Typically, `umap` will be used.
2520
+ If this is not provided, 'pca' will be used if exists, otherwise a
2521
+ PCA will be performed.
2522
+ modes (type=auto): The modes to use for the analysis.
2523
+ A list or a string with comma separated values.
2524
+ fitting_by (choice): The mode to use for fitting the velocities.
2525
+ - stochastic: Stochastic mode
2526
+ - deterministic: Deterministic mode
2527
+ min_shared_counts (type=int): Minimum number of counts
2528
+ (both unspliced and spliced) required for a gene.
2529
+ n_neighbors (type=int): The number of neighbors to use for the velocity graph.
2530
+ n_pcs (type=int): The number of PCs to use for the velocity graph.
2531
+ stream_smooth (type=float): Multiplication factor for scale in Gaussian kernel
2532
+ around grid point.
2533
+ stream_density (type=float): Controls the closeness of streamlines.
2534
+ When density = 2.0, the domain is divided into a 60x60 grid, whereas
2535
+ density linearly scales this grid. Each cell in the grid can have,
2536
+ at most, one traversing streamline. For different densities in each
2537
+ direction, use a tuple (density_x, density_y).
2538
+ arrow_size (type=float): Scaling factor for the arrow size.
2539
+ arrow_length (type=float): Length of arrows.
2540
+ arrow_density (type=float): Density of arrows.
2541
+ denoise (flag): Whether to denoise the data.
2542
+ denoise_topn (type=int): Number of genes with highest likelihood selected to
2543
+ infer velocity directions.
2544
+ kinetics (flag): Whether to compute the RNA velocity kinetics.
2545
+ kinetics_topn (type=int): Number of genes with highest likelihood selected to
2546
+ infer velocity directions.
2547
+ calculate_velocity_genes (flag): Whether to calculate the velocity genes.
2548
+ top_n (type=int): The number of top features to plot.
2549
+ res (type=int): The resolution of the plots.
2550
+ rscript: The path to the Rscript executable used to convert RDS file to AnnData.
2551
+ If `in.sobjfile` is an RDS file, it will be converted to AnnData file
2552
+ (h5ad). You need `Seurat`, `SeuratDisk` and `digest` installed.
2553
+ outtype (choice): The output file type.
2554
+ - input: The same as the input file type.
2555
+ - anndata: AnnData object
2556
+ - h5seurat: h5seurat object
2557
+ - h5ad: h5ad object
2558
+ """
2559
+
2560
+ input = "sobjfile:file"
2561
+ output = "outfile:file:{{in.sobjfile | stem}}-scvelo.{{envs.outtype}}"
2562
+ lang = config.lang.python
2563
+ envs = {
2564
+ "ncores": config.misc.ncores,
2565
+ "group_by": "seurat_clusters",
2566
+ "reduction": "umap",
2567
+ "modes": ["stochastic", "deterministic", "dynamical"],
2568
+ "fitting_by": "stochastic",
2569
+ "min_shared_counts": 30,
2570
+ "n_neighbors": 30,
2571
+ "n_pcs": 30,
2572
+ "stream_smooth": 0.5,
2573
+ "stream_density": 2.0,
2574
+ "arrow_size": 5.0,
2575
+ "arrow_length": 5.0,
2576
+ "arrow_density": 0.5,
2577
+ "denoise": False,
2578
+ "denoise_topn": 3,
2579
+ "kinetics": False,
2580
+ "kinetics_topn": 100,
2581
+ "calculate_velocity_genes": False,
2582
+ "top_n": 6,
2583
+ "res": 100,
2584
+ "rscript": config.lang.rscript,
2585
+ "outtype": "input",
2586
+ }
2587
+ script = "file://../scripts/scrna/ScVelo.py"
2588
+
2589
+
2590
+ class SlingShot(Proc):
2591
+ """Trajectory inference using SlingShot
2592
+
2593
+ This process is implemented based on the R package `slingshot`.
2594
+
2595
+ Input:
2596
+ sobjfile: The seurat object file in RDS.
2597
+
2598
+ Output:
2599
+ outfile: The output object with the trajectory information.
2600
+
2601
+ Envs:
2602
+ group_by: The column name in metadata to group the cells.
2603
+ Typically, this column should be the cluster id.
2604
+ reduction: The nonlinear reduction to use for the trajectory analysis.
2605
+ dims (type=auto): The dimensions to use for the analysis.
2606
+ A list or a string with comma separated values.
2607
+ Consecutive numbers can be specified with a colon (`:`) or a dash (`-`).
2608
+ start: The starting group for the SlingShot analysis.
2609
+ end: The ending group for the SlingShot analysis.
2610
+ prefix: The prefix to add to the column names of the resulting pseudotime variable.
2611
+ reverse (flag): Logical value indicating whether to reverse the pseudotime variable.
2612
+ align_start (flag): Whether to align the starting pseudotime values at the maximum pseudotime.
2613
+ seed (type=int): The seed for the random number generator.
2614
+ """ # noqa: E501
2615
+
2616
+ input = "sobjfile:file"
2617
+ output = "outfile:file:{{in.sobjfile | stem}}.RDS"
2618
+ lang = config.lang.rscript
2619
+ envs = {
2620
+ "group_by": "seurat_clusters",
2621
+ "reduction": None,
2622
+ "dims": [1, 2],
2623
+ "start": None,
2624
+ "end": None,
2625
+ "prefix": None,
2626
+ "reverse": False,
2627
+ "align_start": False,
2628
+ "seed": 8525,
2629
+ }
2630
+ script = "file://../scripts/scrna/SlingShot.R"