biopipen 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (134)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +6 -0
  3. biopipen/core/filters.py +77 -26
  4. biopipen/core/testing.py +6 -1
  5. biopipen/ns/bam.py +39 -0
  6. biopipen/ns/cellranger.py +5 -0
  7. biopipen/ns/cellranger_pipeline.py +2 -2
  8. biopipen/ns/cnvkit_pipeline.py +4 -1
  9. biopipen/ns/delim.py +33 -27
  10. biopipen/ns/protein.py +99 -0
  11. biopipen/ns/scrna.py +411 -250
  12. biopipen/ns/snp.py +16 -3
  13. biopipen/ns/tcr.py +125 -1
  14. biopipen/ns/vcf.py +34 -0
  15. biopipen/ns/web.py +5 -1
  16. biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
  17. biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
  18. biopipen/reports/tcr/ClonalStats.svelte +15 -0
  19. biopipen/reports/utils/misc.liq +22 -7
  20. biopipen/scripts/bam/BamMerge.py +2 -2
  21. biopipen/scripts/bam/BamSampling.py +4 -4
  22. biopipen/scripts/bam/BamSort.py +141 -0
  23. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  24. biopipen/scripts/bam/BamSubsetByBed.py +3 -3
  25. biopipen/scripts/bam/CNVpytor.py +10 -10
  26. biopipen/scripts/bam/ControlFREEC.py +11 -11
  27. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  28. biopipen/scripts/bed/BedConsensus.py +5 -5
  29. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  30. biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
  31. biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
  32. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  33. biopipen/scripts/cellranger/CellRangerCount.py +20 -9
  34. biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
  35. biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
  36. biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
  37. biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
  38. biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
  39. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
  41. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  42. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  43. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
  44. biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
  45. biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
  46. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  47. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  48. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  49. biopipen/scripts/delim/SampleInfo.R +85 -139
  50. biopipen/scripts/misc/Config2File.py +2 -2
  51. biopipen/scripts/misc/Str2File.py +2 -2
  52. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  53. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  54. biopipen/scripts/protein/Prodigy.py +4 -4
  55. biopipen/scripts/protein/RMSD.py +178 -0
  56. biopipen/scripts/regulatory/MotifScan.py +8 -8
  57. biopipen/scripts/scrna/CellCellCommunication.py +59 -22
  58. biopipen/scripts/scrna/CellsDistribution.R +31 -6
  59. biopipen/scripts/scrna/MarkersFinder.R +272 -602
  60. biopipen/scripts/scrna/MetaMarkers.R +16 -7
  61. biopipen/scripts/scrna/RadarPlots.R +75 -35
  62. biopipen/scripts/scrna/SCP-plot.R +15202 -0
  63. biopipen/scripts/scrna/ScVelo.py +0 -0
  64. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -25
  65. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -47
  66. biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -385
  67. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +33 -13
  68. biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -228
  69. biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
  70. biopipen/scripts/scrna/SeuratMap2Ref.R +16 -6
  71. biopipen/scripts/scrna/SeuratPreparing.R +138 -81
  72. biopipen/scripts/scrna/SlingShot.R +71 -0
  73. biopipen/scripts/scrna/TopExpressingGenes.R +9 -7
  74. biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
  75. biopipen/scripts/snp/Plink2GTMat.py +26 -11
  76. biopipen/scripts/snp/PlinkFilter.py +7 -7
  77. biopipen/scripts/snp/PlinkFromVcf.py +8 -5
  78. biopipen/scripts/snp/PlinkSimulation.py +4 -4
  79. biopipen/scripts/snp/PlinkUpdateName.py +4 -4
  80. biopipen/scripts/stats/ChowTest.R +48 -22
  81. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  82. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  83. biopipen/scripts/tcr/CDR3AAPhyschem.R +12 -2
  84. biopipen/scripts/tcr/ClonalStats.R +484 -0
  85. biopipen/scripts/tcr/CloneResidency.R +23 -5
  86. biopipen/scripts/tcr/Immunarch-basic.R +8 -1
  87. biopipen/scripts/tcr/Immunarch-clonality.R +5 -0
  88. biopipen/scripts/tcr/Immunarch-diversity.R +25 -4
  89. biopipen/scripts/tcr/Immunarch-geneusage.R +15 -1
  90. biopipen/scripts/tcr/Immunarch-kmer.R +14 -1
  91. biopipen/scripts/tcr/Immunarch-overlap.R +15 -1
  92. biopipen/scripts/tcr/Immunarch-spectratyping.R +10 -1
  93. biopipen/scripts/tcr/Immunarch-tracking.R +6 -0
  94. biopipen/scripts/tcr/Immunarch-vjjunc.R +33 -0
  95. biopipen/scripts/tcr/ScRepLoading.R +127 -0
  96. biopipen/scripts/tcr/TCRClusterStats.R +24 -7
  97. biopipen/scripts/tcr/TCRDock.py +10 -6
  98. biopipen/scripts/tcr/TESSA.R +6 -1
  99. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  100. biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
  101. biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
  102. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  103. biopipen/scripts/vcf/BcftoolsSort.py +4 -4
  104. biopipen/scripts/vcf/BcftoolsView.py +5 -5
  105. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  106. biopipen/scripts/vcf/VcfAnno.py +11 -11
  107. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  108. biopipen/scripts/vcf/VcfFilter.py +5 -5
  109. biopipen/scripts/vcf/VcfFix.py +7 -7
  110. biopipen/scripts/vcf/VcfFix_utils.py +12 -3
  111. biopipen/scripts/vcf/VcfIndex.py +3 -3
  112. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  113. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  114. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  115. biopipen/scripts/vcf/bcftools_utils.py +3 -3
  116. biopipen/scripts/web/Download.py +8 -4
  117. biopipen/scripts/web/DownloadList.py +5 -5
  118. biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
  119. biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
  120. biopipen/scripts/web/gcloud_common.py +1 -1
  121. biopipen/utils/gsea.R +96 -42
  122. biopipen/utils/misc.R +205 -7
  123. biopipen/utils/misc.py +17 -8
  124. biopipen/utils/plot.R +53 -17
  125. biopipen/utils/reference.py +11 -11
  126. biopipen/utils/repr.R +146 -0
  127. biopipen/utils/vcf.py +1 -1
  128. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/METADATA +9 -9
  129. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/RECORD +131 -122
  130. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
  131. biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -139
  132. biopipen/scripts/scrna/SeuratPreparing-common.R +0 -452
  133. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -201
  134. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0
@@ -1,12 +1,12 @@
1
1
  from biopipen.utils.misc import run_command, dict_to_cli_args
2
2
 
3
- infile = {{in.infile | quote}} # pyright: ignore
3
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa
4
4
  outfile = {{out.outfile | quote}} # pyright: ignore
5
5
  outdir = {{out.outdir | quote}} # pyright: ignore
6
6
  perl = {{envs.perl | quote}} # pyright: ignore
7
7
  ref = {{envs.ref | repr}} # pyright: ignore
8
8
  samtools = {{envs.samtools | quote}} # pyright: ignore
9
- args = {{envs.args | repr}} # pyright: ignore
9
+ args: dict = {{envs.args | dict}} # pyright: ignore
10
10
  maf2vcf = {{biopipen_dir | append: "/scripts/tcgamaf/maf2vcf.pl" | repr}} # pyright: ignore
11
11
 
12
12
  args['input-maf'] = infile
@@ -1,6 +1,6 @@
1
1
 
2
- infile = {{in.infile | quote}} # pyright: ignore
3
- outfile = {{out.outfile | quote}} # pyright: ignore
2
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa
3
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
4
4
 
5
5
  with open(infile) as fin, open(outfile, "w") as fout:
6
6
  for line in fin:
@@ -385,6 +385,10 @@ do_one_subset = function(s) {
385
385
  print(g)
386
386
  dev.off()
387
387
 
388
+ pdf(file.path(odir, "estimated_coefficients.pdf"), width=10, height=10)
389
+ print(g)
390
+ dev.off()
391
+
388
392
  add_report(
389
393
  list(
390
394
  kind = "descr",
@@ -408,7 +412,8 @@ do_one_subset = function(s) {
408
412
  contents = list(
409
413
  list(
410
414
  kind = "image",
411
- src = file.path(odir, "estimated_coefficients.png")
415
+ src = file.path(odir, "estimated_coefficients.png"),
416
+ download = file.path(odir, "estimated_coefficients.pdf")
412
417
  )
413
418
  )
414
419
  ),
@@ -465,6 +470,10 @@ do_one_subset = function(s) {
465
470
  print(g)
466
471
  dev.off()
467
472
 
473
+ pdf(file.path(odir, "distribution.pdf"), width=10, height=10)
474
+ print(g)
475
+ dev.off()
476
+
468
477
  add_report(
469
478
  list(
470
479
  kind = "table_image",
@@ -473,7 +482,8 @@ do_one_subset = function(s) {
473
482
  "then scaled to have a mean of 0 and a variance of 1. ",
474
483
  "Horizontal lines depict the mean for each population"
475
484
  ),
476
- src = file.path(odir, "distribution.png")
485
+ src = file.path(odir, "distribution.png"),
486
+ download = file.path(odir, "distribution.pdf")
477
487
  ),
478
488
  h1 = ifelse(
479
489
  is.null(s),
@@ -0,0 +1,484 @@
1
+ library(rlang)
2
+ library(glue)
3
+ library(scplotter)
4
+ library(biopipen.utils)
5
+
6
+ screpfile <- {{in.screpfile | quote}}
7
+ outdir <- {{out.outdir | quote}}
8
+ joboutdir <- {{job.outdir | quote}}
9
+ envs <- {{envs | r}}
10
+ mutaters <- envs$mutaters
11
+ cases <- envs$cases
12
+ envs$mutaters <- NULL
13
+ envs$cases <- NULL
14
+
15
+ log <- get_logger()
16
+ reporter <- get_reporter()
17
+
18
+ VIZ_TYPE_TO_SECTION <- list(
19
+ volume = "Number of Clones",
20
+ abundance = "Clonal Abundance",
21
+ length = "Clonal Sequence Length",
22
+ residency = "Clonal Residency",
23
+ dynamics = "Clonal Dynamics",
24
+ composition = "Clonal Composition",
25
+ overlap = "Clonal Overlap",
26
+ diversity = "Clonal Diversity",
27
+ geneusage = "Gene Usage",
28
+ positional = "Positional Properties",
29
+ kmer = "Kmer Analysis",
30
+ rarefaction = "Rarefaction Analysis"
31
+ )
32
+
33
+ get_plot_descr <- function(viz_type, case) {
34
+ if (identical(viz_type, "volume")) {
35
+ if (identical(case$plot_type %||% "bar", "bar")) {
36
+ out <- glue(
37
+ "This bar graph illustrates the distribution of unique clones across {x}(s). ",
38
+ "The x-axis represents the different {x}(s), while the y-axis denotes the number of unique clones.",
39
+ x = case$x %||% "Sample")
40
+ } else { # box/violin
41
+ out <- glue(
42
+ "This {case$plot_type} plot compares the distribution of unique clones across {x}(s). ",
43
+ "The x-axis represents {x}(s), and the data points were broken down by samples, while the y-axis denotes the number of unique clones.",
44
+ x = case$x)
45
+ if (!is.null(case$comparisons)) {
46
+ out <- glue("{out} The p-values of the comparisons are performed using {case$pairwise_method %||% 'wilcox.test'}.")
47
+ }
48
+ }
49
+ out <- glue("{out} The clones are identified by {case$clone_call %||% 'aa'} and {case$chain %||% 'both'} chain(s) was/were used.")
50
+ } else if (identical(viz_type, "abundance")) {
51
+ if ((case$plot_type %||% "trend") %in% c("trend", "histogram")) {
52
+ out <- glue(
53
+ "The abundance plot illustrates the number of clones at different abundance levels. ",
54
+ "The x-axis represents the abundance levels (different sizes of clones), while the y-axis denotes the number of clones at each level."
55
+ )
56
+ } else { # density
57
+ out <- glue(
58
+ "The abundance plot illustrates the distribution of clones at different abundance levels. ",
59
+ "The x-axis represents the abundance levels (different sizes of clones), while the y-axis denotes the density of clones at each level."
60
+ )
61
+ }
62
+ out <- glue("{out} The clones are identified by {case$clone_call %||% 'aa'} and {case$chain %||% 'both'} chain(s) was/were used.")
63
+ } else if (identical(viz_type, "length")) {
64
+ if (identical(case$plot_type %||% "bar", "bar")) {
65
+ out <- glue(
66
+ "This bar graph illustrates the distribution of the length of the CDR3 sequences. ",
67
+ "The x-axis represents the different lengths of the CDR3 sequences, while the y-axis denotes the number of corresponding sequences. ",
68
+ "{case$chain %||% 'both'} chain(s) was/were counted."
69
+ )
70
+ } else if (identical(case$plot_type, "density")) {
71
+ out <- glue(
72
+ "This density plot illustrates the distribution of the length of the CDR3 sequences. ",
73
+ "The x-axis represents the different lengths of the CDR3 sequences, while the y-axis denotes the density of corresponding sequences. ",
74
+ "{case$chain %||% 'both'} chain(s) was/were counted."
75
+ )
76
+ } else { # box/violin
77
+ out <- glue(
78
+ "This {case$plot_type} plot compares the distribution of the length of the CDR3 sequences. ",
79
+ "The x-axis represents the different lengths of the CDR3 sequences, while the y-axis denotes the number of corresponding sequences. ",
80
+ "{case$chain %||% 'both'} chain(s) was/were counted."
81
+ )
82
+ if (!is.null(case$comparisons)) {
83
+ out <- glue("{out} The p-values of the comparisons are performed using {case$pairwise_method %||% 'wilcox.test'}.")
84
+ }
85
+ }
86
+ } else if (identical(viz_type, "residency")) {
87
+ if (identical(case$plot_type %||% "scatter", "scatter")) {
88
+ out <- glue(
89
+ "This scatter plot illustrates the residency of clones across different groups (axes). ",
90
+ "The x-axis represents the first group, while the y-axis denotes the second group. ",
91
+ "The size of the points represents the size of the clones. For the shared clones, the size of the points ",
92
+ "represents the {case$scatter_size_by %||% 'max'} size of the clones in the groups. ",
93
+ "The color of the points represents the category of the clones. 'Singlet' are the clones with a single cell; ",
94
+ "'Expanded' are the clones with multiple cells; 'Dual' are the clones with cells in both groups; ",
95
+ "'Dual (g1 > g2)' are the clones with cells in both groups and more cells in the first group; ",
96
+ "'Dual (g1 < g2)' are the clones with cells in both groups and more cells in the second group; and ",
97
+ "'Dual (Equal)' are the clones with cells in both groups and equal cells in both groups. ",
98
+ "The {case$scatter_cor %||% 'pearson'} correlation was calculated based on the size of the shared clones and p-value ",
99
+ "is also shown in the subtitle."
100
+ )
101
+ } else if (identical(case$plot_type, "venn")) {
102
+ out <- glue(
103
+ "This {case$plot_type} plot illustrates the residency of clones across different groups. ",
104
+ "The categories of the clones are shown in the Venn diagram, and the color represents the size the category. ",
105
+ "The number of singlets are also annotated in the plot."
106
+ )
107
+ } else { # upset
108
+ out <- glue(
109
+ "This {case$plot_type} plot illustrates the residency of clones across different groups. ",
110
+ "The categories of the clones are shown as rows in the bottom table of the plot. The intersections of the categories ",
111
+ "are shown as the connected lines in the table and the size of the intersections are shown as the bars on the top of ",
112
+ "the plot. The color of the bars represents the size of the intersections."
113
+ )
114
+ }
115
+ out <- glue("{out} The clones are identified by {case$clone_call %||% 'aa'} and {case$chain %||% 'both'} chain(s) was/were used.")
116
+ } else if (identical(viz_type, "dynamics")) {
117
+ if (case$plot_type %in% c("sankey", "alluvial")) {
118
+ out <- glue(
119
+ "This {case$plot_type} plot illustrates the dynamics of clones across different groups. ",
120
+ "The bars are showing the groups and the flow/links are showing the transitions of the clones. "
121
+ )
122
+ } else { # trend
123
+ out <- glue(
124
+ "This trend plot illustrates the dynamics of clones across different groups. ",
125
+ "The x-axis represents the groups, while the y-axis denotes the number/fraction of clones. ",
126
+ "The links between the groups are showing the transitions of the clones. "
127
+ )
128
+ }
129
+ out <- glue("{out} The clones are identified by {case$clone_call %||% 'aa'} and {case$chain %||% 'both'} chain(s) was/were used.")
130
+ } else if (identical(viz_type, "composition")) {
131
+ plot_type <- case$plot_type %||% "bar"
132
+ method <- case$method %||% "homeostasis"
133
+ if (plot_type %in% c("bar", "ring")) {
134
+ out <- glue("This {plot_type} graph illustrates the composition of the categories of the clones.")
135
+ } else { # box/violin
136
+ out <- glue(
137
+ "This {plot_type} plot compares the composition of the categories of the clones. ",
138
+ "For each category in the x-axis, the values are from each sample. "
139
+ )
140
+ if (!is.null(case$comparisons)) {
141
+ out <- glue("{out} The p-values of the comparisons are performed using {case$pairwise_method %||% 'wilcox.test'}.")
142
+ }
143
+ }
144
+ if (method %in% c("homeostasis", "homeo", "rel")) {
145
+ out <- glue("{out} The clone categories are defined based on the relative abundance of the clones in the samples.")
146
+ } else if (method == "top") {
147
+ out <- glue(
148
+ "{out} The clone categories are defined based on the size of the clones in the samples. ",
149
+ "The largest clone ranks as 1, and clones are marked by their indexes."
150
+ )
151
+ } else if (method == "rare") {
152
+ out <- glue(
153
+ "{out} The clone categories are defined based on the size of the clones in the samples. ",
154
+ "The clones are categorized literally based on the size of the clones. For example, ",
155
+ "1 means the singlet clones."
156
+ )
157
+ }
158
+ out <- glue("{out} The clones are identified by {case$clone_call %||% 'aa'} and {case$chain %||% 'both'} chain(s) was/were used.")
159
+ } else if (identical(viz_type, "overlap")) {
160
+ out <- glue(
161
+ "This heatmaps illustrates the overlap of clones across different groups, using \"{case$method %||% 'raw'}\" as the metric. "
162
+ )
163
+ if ((case$method %||% "raw") == "raw") {
164
+ out <- glue("{out} The 'raw' metric is the raw size of the intersection of the clones. ")
165
+ } else if (case$method == "overlap") {
166
+ out <- glue(
167
+ "{out} The 'overlap' metric, a.k.a. 'overlap coefficient' or 'Szymkiewicz-Simpson coefficient', ",
168
+ "is calculated as the size of the intersection divided by the size of the smaller set. "
169
+ )
170
+ } else if (case$method == "morisita") {
171
+ out <- glue("{out} The 'morisita' metric, a.k.a. 'Morisita`s overlap index', ",
172
+ "named after Masaaki Morisita, is a statistical measure of dispersion of individuals in a population. ",
173
+ "This formula is based on the assumption that increasing the size of the samples will increase the diversity ",
174
+ "because it will include different habitats. The value is 0 if the two samples do not overlap in terms of species, ",
175
+ "and 1 if the species occur in the same proportions in both samples."
176
+ )
177
+ } else if (case$method == "jaccard") {
178
+ out <- glue(
179
+ "{out} The 'jaccard' metric, a.k.a. 'Jaccard index', ",
180
+ "is a statistic used for gauging the similarity and diversity of sample sets. ",
181
+ "It is defined in general taking the ratio of two sizes (areas or volumes), ",
182
+ "the intersection size divided by the union size, also called intersection over union."
183
+ )
184
+ } else if (case$method == "cosine") {
185
+ out <- glue("{out} The 'cosine' metric, a.k.a. 'cosine similarity', ",
186
+ "is calculated as the dot product of the vectors divided by the product of the magnitudes of the vectors. "
187
+ )
188
+ }
189
+ out <- glue("{out} The clones are identified by {case$clone_call %||% 'aa'} and {case$chain %||% 'both'} chain(s) was/were used.")
190
+ } else if (identical(viz_type, "diversity")) {
191
+ if (identical(case$plot_type %||% "bar", "bar")) {
192
+ out <- glue(
193
+ "This bar graph illustrates the diversity of the clones across different groups. ",
194
+ "The x-axis represents the groups, while the y-axis denotes the diversity of the clones. "
195
+ )
196
+ } else { # box/violin
197
+ out <- glue(
198
+ "This {case$plot_type} plot compares the diversity of the clones across different groups. ",
199
+ "The x-axis represents the groups, while the y-axis denotes the diversity of the clones. ",
200
+ "For each group on the x-axis, the values are calculated from each sample. "
201
+ )
202
+ if (!is.null(case$comparisons)) {
203
+ out <- glue("{out} The p-values of the comparisons are performed using {case$pairwise_method %||% 'wilcox.test'}.")
204
+ }
205
+ }
206
+ if (identical(case$method %||% "shannon", "shannon")) {
207
+ out <- glue(
208
+ "{out} The diversity is calculated using the Shannon index, ",
209
+ "a statistic that estimates the diversity of a biological community by considering both species richness and evenness. ",
210
+ "The index is calculated as the negative sum of the product of the proportion of each species and the ",
211
+ "logarithm of that proportion. If practically all abundance is concentrated to one type, and the other ",
212
+ "types are very rare (even if there are many of them), Shannon entropy approaches zero. ",
213
+ "When there is only one type in the dataset, Shannon entropy exactly equals zero."
214
+ )
215
+ } else if (identical(case$method, "inv.simpson")) {
216
+ out <- glue(
217
+ "{out} The diversity is calculated using the Inverse Simpson index, simply equals true diversity of order 2. ",
218
+ "It is calculated as the reciprocal of the sum of the squares of the proportion of each species. "
219
+ )
220
+ } else if (identical(case$method, "norm.entropy")) {
221
+ out <- glue(
222
+ "{out} The diversity is calculated using the Normalized Shannon entropy, ",
223
+ "a statistic that estimates the diversity of a biological community by considering both species richness and evenness. ",
224
+ "The index is calculated as the negative sum of the product of the proportion of each species and the ",
225
+ "logarithm of that proportion. If practically all abundance is concentrated to one type, and the other ",
226
+ "types are very rare (even if there are many of them), Shannon entropy approaches zero. ",
227
+ "When there is only one type in the dataset, Shannon entropy exactly equals zero. ",
228
+ "The normalized Shannon entropy is calculated as the Shannon entropy divided by the maximum possible entropy. ",
229
+ "The maximum possible entropy is the Shannon entropy when all species are equally abundant."
230
+ )
231
+ } else if (identical(case$method, "gini.simpson")) {
232
+ out <- glue(
233
+ "{out} The diversity is calculated using the Gini-Simpson index, ",
234
+ "a.k.a Gini impurity. The original Simpson index λ equals the probability that two entities taken at random ",
235
+ "from the dataset of interest (with replacement) represent the same type. Its transformation 1 - λ, ",
236
+ "therefore, equals the probability that the two entities represent different types."
237
+ )
238
+ } else if (identical(case$method, "chao1")) {
239
+ out <- glue(
240
+ "{out} The diversity is calculated using the Chao1 index. ",
241
+ "The Chao1 index is a lower bound estimate of the true species richness of a community. ",
242
+ "It is based on the number of singletons and doubletons in a sample. ",
243
+ "The Chao1 index is calculated as the observed number of species plus the square of the number of singlets ",
244
+ "divided by twice the number of doublets. "
245
+ )
246
+ } else if (identical(case$method, "ACE")) {
247
+ out <- glue(
248
+ "{out} The diversity is calculated using the Abundance-based Coverage Estimator (ACE) index. ",
249
+ "a statistical method used to estimate the number of species in a community or ecosystem. ",
250
+ "The ACE index is calculated as the sum of the estimated number of species in the sample. ",
251
+ "The ACE index is based on the abundance of species and estimates the number of additional ",
252
+ "species that are likely to be present but have not been observed. ",
253
+ "Higher ACE values indicate higher species diversity."
254
+ )
255
+ } else if (identical(case$method, "gini.coeff")) {
256
+ out <- glue(
257
+ "{out} The diversity is calculated using the Gini coefficient. ",
258
+ "The Gini coefficient is a measure of statistical dispersion intended to represent the income or wealth distribution of a nation's residents. ",
259
+ "The Gini coefficient ranges from 0 to 1, where 0 represents perfect equality and 1 represents perfect inequality. ",
260
+ "A Gini coefficient of 0 means that all individuals have the same income, while a Gini coefficient of 1 means that one individual has all the income and the rest have none."
261
+ )
262
+ } else if (case$method %in% c("d50", "dXX")) {
263
+ out <- glue(
264
+ "{out} The diversity is calculated using the {case$method} index. ",
265
+ "The {case$method} index a metric used to measure the diversity of an immune repertoire by calculating the ",
266
+ "percentage of unique clonotype that account for the top {case[['d']] %||% 50}% of all clones. "
267
+ )
268
+ }
269
+ out <- glue("{out} The clones are identified by {case$clone_call %||% 'aa'} and {case$chain %||% 'both'} chain(s) was/were used.")
270
+ } else if (viz_type == "geneusage") {
271
+ if (identical(case$plot_type %||% "bar", "bar")) {
272
+ out <- glue(
273
+ "This bar graph illustrates the distribution of the gene usage of the clones. ",
274
+ "The x-axis represents the different genes, while the y-axis denotes the fraction of the genes. "
275
+ )
276
+ } else if (identical(case$plot_type, "heatmap")) {
277
+ out <- glue(
278
+ "This heatmap illustrates the distribution of the gene usage of the clones in different groups. ",
279
+ "The color of the heatmap represents the fraction of the genes. "
280
+ )
281
+ } else if (case$plot_type %in% c("circos", "chord")) {
282
+ out <- glue("This {case$plot_type} plot illustrates the distribution of the gene usage of the clones")
283
+ if (length(case$genes) == 1) {
284
+ out <- glue(
285
+ "{out} for the '{case$genes[[1]]}' genes in different groups. ",
286
+ "The links are showing the composition of the genes in the groups. "
287
+ )
288
+ } else {
289
+ out <- glue(
290
+ "{out} for the {paste(case$genes, collapse = ' and ')} genes. ",
291
+ "The links are showing the connections between the genes. "
292
+ )
293
+ }
294
+ } else { # sankey/alluvial
295
+ out <- glue(
296
+ "This {case$plot_type} plot illustrates the distribution of the gene usage of the clones in different groups. ",
297
+ "The bars are showing the groups and the flow/links are showing the transitions of the gene usages. "
298
+ )
299
+ }
300
+ } else if (viz_type == "positional") {
301
+ if ((case$plot_type %||% "bar") %in% c("bar", "line")) {
302
+ out <- glue(
303
+ "This {case$plot_type %||% 'bar'} graph illustrates the distribution of the positional properties of the amino acids in the CDR3 sequences. "
304
+ )
305
+ } else if (identical(case$plot_type, "heatmap")) {
306
+ out <- glue(
307
+ "This heatmap illustrates the distribution of the positional properties of the amino acids in the CDR3 sequences ",
308
+ "in different groups. The color of the heatmap represents the intensity of the properties. "
309
+ )
310
+ } else if (case$plot_type %in% c("violin", "box")) {
311
+ out <- glue(
312
+ "This {case$plot_type} plot compares the distribution of the positional properties of the amino acids in the CDR3 sequences. ",
313
+ "The x-axis represents the different positions of the amino acids, while the y-axis denotes the intensity of the properties. ",
314
+ "For each position on the x-axis, the values are calculated from each sample. "
315
+ )
316
+ if (!is.null(case$comparisons)) {
317
+ out <- glue("{out} The p-values of the comparisons are performed using {case$pairwise_method %||% 'wilcox.test'}.")
318
+ }
319
+ }
320
+ if (identical(case$method %||% "AA", "AA")) {
321
+ out <- glue("{out} The positional properties are calculated based on amino acid frequency at each position.")
322
+ } else if (identical(case$method, "shannon")) {
323
+ out <- glue(
324
+ "{out} The positional properties are calculated based on the Shannon entropy of the amino acids at each position. ",
325
+ "The Shannon entropy is a statistic that estimates the diversity of a biological community by considering both species richness and evenness. ",
326
+ "The index is calculated as the negative sum of the product of the proportion of each species and the ",
327
+ "logarithm of that proportion. If practically all abundance is concentrated to one type, and the other ",
328
+ "types are very rare (even if there are many of them), Shannon entropy approaches zero."
329
+ )
330
+ } else if (identical(case$method, "inv.simpson")) {
331
+ out <- glue(
332
+ "{out} The positional properties are calculated based on the Inverse Simpson index of the amino acids at each position. ",
333
+ "The Inverse Simpson index is calculated as the reciprocal of the sum of the squares of the proportion of each species. "
334
+ )
335
+ } else if (identical(case$method, "norm.entropy")) {
336
+ out <- glue(
337
+ "{out} The positional properties are calculated based on the Normalized Shannon entropy of the amino acids at each position. ",
338
+ "The Normalized Shannon entropy is a statistic that estimates the diversity of a biological community by considering both species richness and evenness. ",
339
+ "The index is calculated as the negative sum of the product of the proportion of each species and the ",
340
+ "logarithm of that proportion. If practically all abundance is concentrated to one type, and the other ",
341
+ "types are very rare (even if there are many of them), Shannon entropy approaches zero. ",
342
+ "When there is only one type in the dataset, Shannon entropy exactly equals zero. ",
343
+ "The normalized Shannon entropy is calculated as the Shannon entropy divided by the maximum possible entropy. ",
344
+ "The maximum possible entropy is the Shannon entropy when all species are equally abundant."
345
+ )
346
+ } else if (identical(case$method, "Atchley")) {
347
+ out <- glue(
348
+ "{out} The positional properties are calculated based on the Atchley factors of the amino acids at each position. ",
349
+ "The Atchley factors are a set of six factors that describe the amino acids based on their physicochemical properties. ",
350
+ "The factors are polarity, secondary structure, volume, codon diversity, electrostatic charge, and solvent accessibility. "
351
+ )
352
+ } else if (identical(case$method, "Kidera")) {
353
+ out <- glue(
354
+ "{out} The positional properties are calculated based on the Kidera factors of the amino acids at each position. ",
355
+ "The Kidera factors are a set of ten factors that describe the amino acids based on their physicochemical properties. ",
356
+ "The factors are hydrophobicity, hydrophilicity, side chain mass, pK1, pK2, pI, side chain pK, alpha helix, beta sheet, and turn. "
357
+ )
358
+ } else if (identical(case$method, "stScales")) {
359
+ out <- glue(
360
+ "{out} The positional properties are calculated based on the stScales of the amino acids at each position. ",
361
+ "The stScales were proposed by Yang et al, taking 827 properties into account which are mainly constitutional, ",
362
+ "topological, geometrical, hydrophobic, elec- tronic, and steric properties. "
363
+ )
364
+ } else if (identical(case$method, "tScales")) {
365
+ out <- glue(
366
+ "{out} The positional properties are calculated based on the tScales of the amino acids at each position. ",
367
+ "The tScales are a descriptor set for amino acids that are based on topological descriptors. "
368
+ )
369
+ } else if (identical(case$method, "VHSE")) {
370
+ out <- glue(
371
+ "{out} The positional properties are calculated based on Vectors of Hydrophobic, Steric, and Electronic (VHSE) properties of the amino acids at each position. "
372
+ )
373
+ }
374
+ out <- glue("{out} The properties are calculated based on the {case$chain %||% 'both'} chain(s).")
375
+ } else if (viz_type == "kmer") {
376
+ if ((case$plot_type %||% "bar") %in% c("bar", "line")) {
377
+ out <- glue(
378
+ "This {case$plot_type %||% 'bar'} graph illustrates the distribution of the kmers of the CDR3 sequences. "
379
+ )
380
+ } else if (case$plot_type == "heatmap") {
381
+ out <- glue(
382
+ "This heatmap illustrates the distribution of the kmers of the CDR3 sequences in different groups. ",
383
+ "The color of the heatmap represents the frequency of the kmers. "
384
+ )
385
+ }
386
+ out <- glue("{out} The kmers are calculated based on the {case$k %||% 3}-mers of the CDR3 sequences, and {case$chain %||% 'both'} chain(s) was/were used.")
387
+ } else if (viz_type == "rarefaction") {
388
+ out <- glue(
389
+ "This rarefaction curve illustrates the number of unique clones as a function of the number of cells. ",
390
+ "The x-axis represents the number of cells, while the y-axis denotes the number of unique clones. ",
391
+ "The solid line represents the observed number of unique clones, while the dashed line represents the expected number of unique clones. ",
392
+ "The clones are identified by {case$clone_call %||% 'aa'} and {case$chain %||% 'both'} chain(s) was/were used."
393
+ )
394
+ }
395
+
396
+ out
397
+ }
398
+
399
# ---- Load inputs ---------------------------------------------------------
log$info("Loading scRepertoire object ...")
screp <- readRDS(screpfile)

log$info("Applying mutaters if any ...")
screp <- ScRepMutate(screp, mutaters)

# ---- Validate cases and settle their names -------------------------------
log$info("Making cases ...")
cases <- expand_cases(cases, envs)

# Tally, per visualization type, the cases whose name carries no "::"
# separator. Every case is validated along the way.
viz_types <- list()
for (name in names(cases)) {
    case <- cases[[name]]

    if (is.null(case$viz_type)) {
        stop("Error: Visualization type is not defined for case '", name, "'")
    }

    if (!(case$viz_type %in% names(VIZ_TYPE_TO_SECTION))) {
        stop("Error: Unknown visualization type '", case$viz_type, "' for case '", name,
            "'. Available types: ", paste(names(VIZ_TYPE_TO_SECTION), collapse = ", "))
    }

    # Names that already contain "::" are not counted.
    if (grepl("::", name, fixed = TRUE)) {
        next
    }
    viz_types[[case$viz_type]] <- (viz_types[[case$viz_type]] %||% 0) + 1
}

# When a visualization type is used by more than one un-prefixed case,
# prepend the section mapped to that type ("<section>::<name>").
# Otherwise return TRUE (presumably list_rename keeps the name as is --
# confirm against the helper).
cases <- list_rename(cases, function(name, case) {
    needs_prefix <- !grepl("::", name, fixed = TRUE) && viz_types[[case$viz_type]] > 1
    if (!needs_prefix) {
        return(TRUE)
    }
    paste0(VIZ_TYPE_TO_SECTION[[case$viz_type]], "::", name)
})
429
+
430
#' Render one visualization case and register it in the report.
#'
#' Builds the plot with the scplotter function named
#' `Clonal<TitleCased viz_type>Plot`, saves it as PNG plus any extra formats,
#' optionally saves the code to reproduce it, and adds a "table_image" entry
#' to the reporter.
#'
#' @param name Case name; passed to `case_info()` to derive the output prefix
#'   and the report section/name.
#' @param case List of case options. The entries `viz_type`, `descr`,
#'   `devpars`, `more_formats`, `save_code` and `subset` are consumed here;
#'   the remaining entries (plus `data`) are passed to the plotting function.
#' @return The value of `reporter$add2()`.
do_case <- function(name, case) {
    log$info("- Processing case: {name}")
    info <- case_info(name, outdir, is_dir = FALSE, create = TRUE)

    # extract_vars appears to bind the listed entries as local variables
    # (viz_type, descr, devpars, more_formats, save_code, and subset_ from
    # `subset`) and return the remaining entries -- the code below relies on
    # those locals existing.
    case <- extract_vars(case, "viz_type", "descr", "devpars", "more_formats", "save_code", subset_ = "subset")

    if (is.null(subset_)) {
        case$data <- screp
    } else {
        case$data <- ScRepSubset(screp, subset_)
    }

    # Resolve the plotting function by name. utils::getFromNamespace() raises
    # its own error for a missing object and never returns NULL, so a NULL
    # check after it can never fire. Test for existence first so the intended
    # "Unknown visualization type" message is actually reachable.
    plot_fn_name <- paste0("Clonal", tools::toTitleCase(viz_type), "Plot")
    scplotter_ns <- asNamespace("scplotter")
    if (!exists(plot_fn_name, envir = scplotter_ns, mode = "function", inherits = FALSE)) {
        stop("Error: Unknown visualization type: ", viz_type)
    }
    plot_fn <- get(plot_fn_name, envir = scplotter_ns, mode = "function", inherits = FALSE)

    p <- do_call(plot_fn, case)
    # PNG is always produced because the report entry points at it.
    save_plot(p, info$prefix, devpars, formats = unique(c("png", more_formats)))

    report <- list(
        kind = "table_image",
        src = paste0(info$prefix, ".png"),
        download = list(),
        # A user-supplied description wins over the generated one.
        descr = html_escape(descr %||% get_plot_descr(viz_type, case)),
        name = html_escape(info$name)
    )

    # Offer every non-PNG format as a download.
    exformats <- setdiff(more_formats, "png")
    if (length(exformats) > 0) {
        report$download <- lapply(exformats, function(fmt) {
            paste0(info$prefix, ".", fmt)
        })
    }

    if (isTRUE(save_code)) {
        save_plotcode(
            p,
            setup = c('library(scplotter)', '', 'load("data.RData")'),
            prefix = info$prefix,
            "case"
        )
        report$download <- c(report$download, list(list(
            src = paste0(info$prefix, ".code.zip"),
            tip = "Download the code to reproduce the plot",
            icon = "Code"
        )))
    }

    reporter$add2(report, hs = c(info$section, info$name), ui = "table_of_images:2")
}
481
+
482
# Process every case in order; each call adds its own entry to the reporter.
lapply(
    names(cases),
    function(case_name) do_case(case_name, cases[[case_name]])
)

# Save the reporter's collected entries to the job output directory.
reporter$save(joboutdir)
@@ -500,10 +500,16 @@ handle_subject <- function(i, subjects, casename, case) {
500
500
  print(scatter_p)
501
501
  dev.off()
502
502
 
503
+ scatter_pdf <- gsub(".png$", ".pdf", scatter_png)
504
+ pdf(scatter_pdf, width = 10, height = 8)
505
+ print(scatter_p)
506
+ dev.off()
507
+
503
508
  add_report(
504
509
  list(
505
510
  name = paste0(subject, " (", pair[1], " - ", pair[2], ")"),
506
- src = scatter_png
511
+ src = scatter_png,
512
+ download = scatter_pdf
507
513
  ),
508
514
  h1 = h$h1,
509
515
  h2 = h$h2,
@@ -515,13 +521,19 @@ handle_subject <- function(i, subjects, casename, case) {
515
521
  # upset/venn
516
522
  venn_dir <- file.path(casedir, "venn")
517
523
  venn_png <- file.path(venn_dir, paste0("venn_", slugify(subject), ".png"))
524
+ venn_pdf <- gsub(".png$", ".pdf", venn_png)
525
+ p <- plot_venndg(counts, groups, singletons)
518
526
  png(venn_png, res = 100, height = 600, width = 800)
519
- print(plot_venndg(counts, groups, singletons))
527
+ print(p)
528
+ dev.off()
529
+
530
+ pdf(venn_pdf, width = 8, height = 6)
531
+ print(p)
520
532
  dev.off()
521
533
 
522
534
  h <- headings(case$section, casename, "Overlapping Clones (Venn Diagram)")
523
535
  add_report(
524
- list(src = venn_png, name = subject),
536
+ list(src = venn_png, name = subject, download = venn_pdf),
525
537
  h1 = h$h1,
526
538
  h2 = h$h2,
527
539
  h3 = h$h3,
@@ -530,13 +542,19 @@ handle_subject <- function(i, subjects, casename, case) {
530
542
 
531
543
  upset_dir <- file.path(casedir, "upset")
532
544
  upset_png <- file.path(upset_dir, paste0("upset_", slugify(subject), ".png"))
545
+ upset_pdf <- gsub(".png$", ".pdf", upset_png)
546
+ p <- plot_upset(counts, singletons, case$upset_ymax, case$upset_trans)
533
547
  png(upset_png, res = 100, height = 600, width = 800)
534
- print(plot_upset(counts, singletons, case$upset_ymax, case$upset_trans))
548
+ print(p)
549
+ dev.off()
550
+
551
+ pdf(upset_pdf, width = 8, height = 6)
552
+ print(p)
535
553
  dev.off()
536
554
 
537
555
  h <- headings(case$section, casename, "Overlapping Clones (UpSet Plots)")
538
556
  add_report(
539
- list(src = upset_png, name = subject),
557
+ list(src = upset_png, name = subject, download = upset_pdf),
540
558
  h1 = h$h1,
541
559
  h2 = h$h2,
542
560
  h3 = h$h3,
@@ -63,15 +63,22 @@ do_one_case_basic = function(name, case, method) {
63
63
  } else {
64
64
  p = vis(exp, .by = case$by, .meta = d$meta)
65
65
  }
66
+
66
67
  ofig = file.path(odir, paste0(name, ".png"))
67
68
  png(ofig, width = case$devpars$width, height = case$devpars$height, res = case$devpars$res)
68
69
  print(p + scale_fill_biopipen())
69
70
  dev.off()
70
71
 
72
+ ofig_pdf = file.path(odir, paste0(name, ".pdf"))
73
+ pdf(ofig_pdf, width = case$devpars$width / case$devpars$res, height = case$devpars$height / case$devpars$res)
74
+ print(p + scale_fill_biopipen())
75
+ dev.off()
76
+
71
77
  add_report(
72
78
  list(
73
79
  src = ofig,
74
- name = if (name == "DEFAULT") NULL else name
80
+ name = if (name == "DEFAULT") NULL else name,
81
+ download = ofig_pdf
75
82
  ),
76
83
  h1 = "Exploratory Analysis",
77
84
  h2 = switch(method,
@@ -78,6 +78,11 @@ do_one_case_clonality = function(name, case, method) {
78
78
  print(p)
79
79
  dev.off()
80
80
 
81
+ ofig_pdf = file.path(odir, paste0(name, ".pdf"))
82
+ pdf(ofig_pdf, width = case$devpars$width / case$devpars$res, height = case$devpars$height / case$devpars$res)
83
+ print(p)
84
+ dev.off()
85
+
81
86
  add_report(
82
87
  list(
83
88
  src = ofig,