biopipen 0.33.1__py3-none-any.whl → 0.34.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (150)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/filters.py +10 -183
  3. biopipen/core/proc.py +5 -3
  4. biopipen/core/testing.py +8 -1
  5. biopipen/ns/bam.py +40 -4
  6. biopipen/ns/cnv.py +1 -1
  7. biopipen/ns/cnvkit.py +1 -1
  8. biopipen/ns/delim.py +1 -1
  9. biopipen/ns/gsea.py +63 -37
  10. biopipen/ns/misc.py +38 -0
  11. biopipen/ns/plot.py +8 -0
  12. biopipen/ns/scrna.py +328 -292
  13. biopipen/ns/scrna_metabolic_landscape.py +207 -366
  14. biopipen/ns/tcr.py +165 -97
  15. biopipen/reports/bam/CNVpytor.svelte +4 -9
  16. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  17. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  18. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  19. biopipen/reports/{delim/SampleInfo.svelte → common.svelte} +2 -3
  20. biopipen/reports/scrna/DimPlots.svelte +1 -1
  21. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +51 -22
  22. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +46 -42
  23. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +63 -6
  24. biopipen/reports/snp/PlinkCallRate.svelte +2 -2
  25. biopipen/reports/snp/PlinkFreq.svelte +1 -1
  26. biopipen/reports/snp/PlinkHWE.svelte +1 -1
  27. biopipen/reports/snp/PlinkHet.svelte +1 -1
  28. biopipen/reports/snp/PlinkIBD.svelte +1 -1
  29. biopipen/reports/tcr/CDR3AAPhyschem.svelte +1 -1
  30. biopipen/scripts/bam/CNAClinic.R +41 -6
  31. biopipen/scripts/bam/CNVpytor.py +2 -1
  32. biopipen/scripts/bam/ControlFREEC.py +2 -3
  33. biopipen/scripts/bam/SamtoolsView.py +33 -0
  34. biopipen/scripts/cnv/AneuploidyScore.R +25 -13
  35. biopipen/scripts/cnv/AneuploidyScoreSummary.R +218 -163
  36. biopipen/scripts/cnv/TMADScore.R +4 -4
  37. biopipen/scripts/cnv/TMADScoreSummary.R +51 -84
  38. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +3 -3
  39. biopipen/scripts/cnvkit/CNVkitHeatmap.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitReference.py +3 -3
  41. biopipen/scripts/delim/RowsBinder.R +1 -1
  42. biopipen/scripts/delim/SampleInfo.R +4 -1
  43. biopipen/scripts/gene/GeneNameConversion.R +14 -12
  44. biopipen/scripts/gsea/Enrichr.R +2 -2
  45. biopipen/scripts/gsea/FGSEA.R +184 -50
  46. biopipen/scripts/gsea/PreRank.R +3 -3
  47. biopipen/scripts/misc/Plot.R +80 -0
  48. biopipen/scripts/plot/VennDiagram.R +2 -2
  49. biopipen/scripts/protein/ProdigySummary.R +34 -27
  50. biopipen/scripts/regulatory/MotifAffinityTest.R +11 -9
  51. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +5 -5
  52. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +4 -4
  53. biopipen/scripts/regulatory/VariantMotifPlot.R +10 -8
  54. biopipen/scripts/regulatory/motifs-common.R +10 -9
  55. biopipen/scripts/rnaseq/Simulation-ESCO.R +14 -11
  56. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +7 -4
  57. biopipen/scripts/rnaseq/Simulation.R +0 -2
  58. biopipen/scripts/rnaseq/UnitConversion.R +6 -5
  59. biopipen/scripts/scrna/AnnData2Seurat.R +25 -73
  60. biopipen/scripts/scrna/CellCellCommunication.py +1 -1
  61. biopipen/scripts/scrna/CellCellCommunicationPlots.R +51 -168
  62. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +99 -150
  63. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +11 -9
  64. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +12 -9
  65. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +14 -11
  66. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +19 -16
  67. biopipen/scripts/scrna/CellTypeAnnotation.R +10 -2
  68. biopipen/scripts/scrna/CellsDistribution.R +1 -1
  69. biopipen/scripts/scrna/ExprImputation-alra.R +87 -11
  70. biopipen/scripts/scrna/ExprImputation-rmagic.R +247 -21
  71. biopipen/scripts/scrna/ExprImputation-scimpute.R +8 -5
  72. biopipen/scripts/scrna/MarkersFinder.R +481 -215
  73. biopipen/scripts/scrna/MetaMarkers.R +3 -3
  74. biopipen/scripts/scrna/ModuleScoreCalculator.R +14 -13
  75. biopipen/scripts/scrna/RadarPlots.R +1 -1
  76. biopipen/scripts/scrna/ScFGSEA.R +231 -76
  77. biopipen/scripts/scrna/ScSimulation.R +11 -10
  78. biopipen/scripts/scrna/ScVelo.py +605 -0
  79. biopipen/scripts/scrna/Seurat2AnnData.R +2 -3
  80. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +1 -1
  81. biopipen/scripts/scrna/SeuratClusterStats-features.R +43 -30
  82. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +56 -65
  83. biopipen/scripts/scrna/SeuratClusterStats-stats.R +4 -4
  84. biopipen/scripts/scrna/SeuratClusterStats.R +9 -6
  85. biopipen/scripts/scrna/SeuratClustering.R +31 -48
  86. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  87. biopipen/scripts/scrna/SeuratMap2Ref.R +66 -367
  88. biopipen/scripts/scrna/SeuratMetadataMutater.R +5 -7
  89. biopipen/scripts/scrna/SeuratPreparing.R +76 -24
  90. biopipen/scripts/scrna/SeuratSubClustering.R +46 -185
  91. biopipen/scripts/scrna/{SlingShot.R → Slingshot.R} +12 -16
  92. biopipen/scripts/scrna/Subset10X.R +2 -2
  93. biopipen/scripts/scrna/TopExpressingGenes.R +144 -185
  94. biopipen/scripts/scrna/celltypist-wrapper.py +6 -4
  95. biopipen/scripts/scrna/seurat_anndata_conversion.py +81 -0
  96. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +429 -123
  97. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +346 -245
  98. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +182 -173
  99. biopipen/scripts/snp/MatrixEQTL.R +39 -20
  100. biopipen/scripts/snp/PlinkCallRate.R +43 -34
  101. biopipen/scripts/snp/PlinkFreq.R +34 -41
  102. biopipen/scripts/snp/PlinkHWE.R +23 -18
  103. biopipen/scripts/snp/PlinkHet.R +26 -22
  104. biopipen/scripts/snp/PlinkIBD.R +30 -34
  105. biopipen/scripts/stats/ChowTest.R +9 -8
  106. biopipen/scripts/stats/DiffCoexpr.R +13 -11
  107. biopipen/scripts/stats/LiquidAssoc.R +7 -8
  108. biopipen/scripts/stats/Mediation.R +8 -8
  109. biopipen/scripts/stats/MetaPvalue.R +11 -13
  110. biopipen/scripts/stats/MetaPvalue1.R +6 -5
  111. biopipen/scripts/tcr/CDR3AAPhyschem.R +105 -164
  112. biopipen/scripts/tcr/ClonalStats.R +6 -5
  113. biopipen/scripts/tcr/CloneResidency.R +3 -3
  114. biopipen/scripts/tcr/CloneSizeQQPlot.R +2 -2
  115. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  116. biopipen/scripts/tcr/ImmunarchFilter.R +3 -3
  117. biopipen/scripts/tcr/ImmunarchLoading.R +5 -5
  118. biopipen/scripts/tcr/ScRepCombiningExpression.R +39 -0
  119. biopipen/scripts/tcr/ScRepLoading.R +114 -92
  120. biopipen/scripts/tcr/TCRClusterStats.R +2 -2
  121. biopipen/scripts/tcr/TCRClustering.R +86 -97
  122. biopipen/scripts/tcr/TESSA.R +65 -115
  123. biopipen/scripts/tcr/VJUsage.R +5 -5
  124. biopipen/scripts/vcf/TruvariBenchSummary.R +15 -11
  125. biopipen/utils/common_docstrs.py +66 -63
  126. biopipen/utils/reporter.py +177 -0
  127. {biopipen-0.33.1.dist-info → biopipen-0.34.1.dist-info}/METADATA +2 -1
  128. {biopipen-0.33.1.dist-info → biopipen-0.34.1.dist-info}/RECORD +130 -145
  129. {biopipen-0.33.1.dist-info → biopipen-0.34.1.dist-info}/WHEEL +1 -1
  130. biopipen/reports/scrna/CellCellCommunicationPlots.svelte +0 -14
  131. biopipen/reports/scrna/ScFGSEA.svelte +0 -16
  132. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -16
  133. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -37
  134. biopipen/reports/scrna/SeuratPreparing.svelte +0 -15
  135. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -28
  136. biopipen/reports/utils/gsea.liq +0 -110
  137. biopipen/scripts/scrna/CellTypeAnnotation-common.R +0 -10
  138. biopipen/scripts/scrna/SeuratClustering-common.R +0 -213
  139. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -193
  140. biopipen/utils/caching.R +0 -44
  141. biopipen/utils/gene.R +0 -95
  142. biopipen/utils/gsea.R +0 -329
  143. biopipen/utils/io.R +0 -20
  144. biopipen/utils/misc.R +0 -602
  145. biopipen/utils/mutate_helpers.R +0 -581
  146. biopipen/utils/plot.R +0 -209
  147. biopipen/utils/repr.R +0 -146
  148. biopipen/utils/rnaseq.R +0 -48
  149. biopipen/utils/single_cell.R +0 -207
  150. {biopipen-0.33.1.dist-info → biopipen-0.34.1.dist-info}/entry_points.txt +0 -0
@@ -1,581 +0,0 @@
1
- suppressPackageStartupMessages(library(rlang))
2
- suppressPackageStartupMessages(library(tidyselect))
3
- suppressPackageStartupMessages(library(dplyr))
4
- suppressPackageStartupMessages(library(tidyr))
5
-
6
- #' Get expanded, collapsed, emerged or vanished clones from a meta data frame
7
- #'
8
- #' @rdname Get expanded, collapsed, emerged or vanished clones
9
- #'
10
- #' @param df The meta data frame
11
- #' @param group.by The column name (without quotes) in metadata to group the
12
- #' cells.
13
- #' @param idents The groups of cells to compare (values in `group.by` column).
14
- #' Either length 1 (`ident_1`) or length 2 (`ident_1` and `ident_2`).
15
- #' If length 1, the rest of the cells with non-NA values in `group.by` will
16
- #' be used as `ident_2`.
17
- #' @param subset An expression to subset the cells, will be passed to
18
- #' `dplyr::filter()`. Default is `TRUE` (no filtering).
19
- #' @param each A column name (without quotes) in metadata to split the cells.
20
- #' Each comparison will be done for each value in this column.
21
- #' @param id The column name (without quotes) in metadata for the
22
- #' group ids (i.e. `CDR3.aa`)
23
- #' @param compare Either a (numeric) column name (i.e. `Clones`, without quotes)
24
- #' in metadata to compare between groups, or `.n` to compare the
25
- #' number of cells in each group.
26
- #' @param fun The way to compare between groups. Either `"expanded"`,
27
- #' `"collapsed"`, `"emerged"` or `"vanished"`.
28
- #' @param uniq Whether to return unique ids or not. Default is `TRUE`.
29
- #' If `FALSE`, you can mutate the meta data frame with the returned ids.
30
- #' For example, `df %>% mutate(expanded = expanded(...))`.
31
- #' @param debug Return the transformed data frame with counts, predicates, sum, and diff.
32
- #' @param order The order of the returned ids. It could be `sum` or `diff`,
33
- #' which is the sum or diff of the `compare` between idents. Two kinds of
34
- #' modifiers can be added, including `desc` and `abs`. For example,
35
- #' `sum,desc` means the sum of `compare` between idents in descending order.
36
- #' Default is `diff,abs,desc`.
37
- #' It only works when `uniq` is `TRUE`. If `uniq` is `FALSE`, the returned
38
- #' ids will be in the same order as in `df`.
39
- #' @param include_emerged Whether to include emerged clones for the expanded clones.
40
- #' Default is `FALSE`. It only works for `"expanded"`.
41
- #' @param include_vanished Whether to include vanished clones for the collapsed clones.
42
- #' Default is `FALSE`. It only works for `"collapsed"`.
43
- #'
44
- #' @return A vector of expanded or collapsed clones (in `id` column)
45
- #' If uniq is `FALSE`, the vector will be the same length as `df`.
46
- #'
47
- #' @examples
48
- #' # Get expanded clones
49
- #' df <- tibble(
50
- #' Clones = c(10, 8, 1, 5, 9, 2, 3, 7, 6, 4, 9, 9),
51
- #' Source = c(
52
- #' "Tumor", "Normal", "Normal", "Normal", "Tumor", "Tumor",
53
- #' "Tumor", "Normal", "Normal", "Normal", NA, "X"
54
- #' ),
55
- #' CDR3.aa = c("A", "C", "B", "E", "D", "E", "E", "B", "B", "B", "A", "A")
56
- #' )
57
- #'
58
- #' expanded(df, Source, c("Tumor", "Normal"))
59
- #' # The transformed data frame looks like this:
60
- # CDR3.aa ..predicate ..sum ..diff
61
- # <chr> <lgl> <dbl> <dbl>
62
- # 1 A TRUE 10 10
63
- # 2 B FALSE 1 -1
64
- # 3 C FALSE 8 -8
65
- # 4 D TRUE 9 9
66
- # 5 E FALSE 7 -3
67
- #'
68
- #' # [1] "A" "D"
69
- #'
70
- #' # Get collapsed clones
71
- #' collapsed(df, Source, c("Tumor", "Normal"))
72
- #' # [1] "B" "C" "E"
73
- #'
74
- #' # Get emerged clones
75
- #' emerged(df, Source, c("Tumor", "Normal"))
76
- #' # [1] "A" "D"
77
- #'
78
- #' # Get vanished clones
79
- #' vanished(df, Source, c("Tumor", "Normal"))
80
- #' # [1] "B" "C"
81
- .size_compare <- function(
82
- df,
83
- group.by, # nolint
84
- idents,
85
- subset,
86
- id,
87
- compare,
88
- fun,
89
- each,
90
- uniq,
91
- order,
92
- debug
93
- ) {
94
- if (length(idents) == 1) {
95
- ident_1 <- idents[1]
96
- ident_2 <- NULL
97
- } else if (length(idents) == 2) {
98
- ident_1 <- idents[1]
99
- ident_2 <- idents[2]
100
- } else {
101
- stop("idents must be length 1 or 2")
102
- }
103
- if (is.null(ident_2)) ident_2 <- "<NULL>"
104
-
105
- if (is_empty(attr(group.by, ".Environment"))) {
106
- # Works if a (quoted) string passed
107
- group.by <- sym(as_name(group.by))
108
- }
109
- if (is_empty(attr(id, ".Environment"))) {
110
- id <- sym(as_name(id))
111
- }
112
- if (is_empty(attr(compare, ".Environment"))) {
113
- compare <- sym(as_name(compare))
114
- }
115
- compare_label <- as_name(compare)
116
- compare_is_count <- compare_label == '.n'
117
-
118
- if (!as_name(group.by) %in% colnames(df)) {
119
- stop(paste0(
120
- '`group.by` must be a column name in df. Got "',
121
- as_name(group.by),
122
- '"'
123
- ))
124
- }
125
-
126
- if (!compare_is_count && !compare_label %in% colnames(df)) {
127
- stop(paste0(
128
- "`compare` must be either a column name in df, or 'count'/'.n'. ",
129
- 'Got "',
130
- compare_label,
131
- '"'
132
- ))
133
- }
134
-
135
- predicate <- function(ident_1, ident_2) {
136
- if (fun == "expanded") {
137
- ident_1 > ident_2 && ident_2 > 0
138
- } else if (fun == "expanded+") {
139
- ident_1 > ident_2
140
- } else if (fun == "collapsed") {
141
- ident_1 < ident_2 && ident_1 > 0
142
- } else if (fun == "collapsed+") {
143
- ident_1 < ident_2
144
- } else if (fun == "emerged") {
145
- ident_1 > 0 && ident_2 == 0
146
- } else if (fun == "vanished") {
147
- ident_1 == 0 && ident_2 > 0
148
- }
149
- }
150
-
151
- # subset the data frame
152
- trans <- df %>%
153
- dplyr::filter(!!subset) %>%
154
- drop_na(!!id) %>%
155
- # # remove NA values in group.by column
156
- # dplyr::filter(!is.na(!!group.by)) %>%
157
- # mark the group.by column (as .group) as ident_1 or ident_2 or NA
158
- mutate(
159
- .group = if_else(
160
- !!group.by == ident_1,
161
- "ident_1",
162
- if_else(ident_2 != "<NULL>" & !!group.by != ident_2, NA, "ident_2")
163
- )
164
- ) %>%
165
- # remove NA values in .group column
166
- drop_na(.group)
167
-
168
- if (is_empty(attr(each, ".Environment"))) {
169
- if (as_label(each) == "NULL") {
170
- each <- NULL
171
- } else {
172
- each <- sym(as_name(each))
173
- }
174
- }
175
- if (is.null(each)) {
176
- trans <- trans %>% group_by(!!id, .group)
177
- } else {
178
- trans <- trans %>% group_by(!!each, !!id, .group)
179
- }
180
-
181
- if (compare_is_count) {
182
- trans <- trans %>% summarise(.n = n(), .groups = "drop")
183
- } else {
184
- trans <- trans %>% summarise(.n = first(!!compare), .groups = "drop")
185
- }
186
-
187
- trans <- trans %>% pivot_wider(names_from = .group, values_from = .n) %>%
188
- replace_na(list(ident_1 = 0, ident_2 = 0)) %>%
189
- rowwise() %>%
190
- # add the predicates, sums and diffs
191
- mutate(
192
- .predicate = predicate(ident_1, ident_2),
193
- .sum = ident_1 + ident_2,
194
- .diff = ident_1 - ident_2
195
- ) %>%
196
- ungroup() %>%
197
- arrange(!!order)
198
-
199
- if (debug) {
200
- return(trans)
201
- }
202
-
203
- uniq_ids <- trans %>% filter(.predicate) %>% pull(!!id) %>% as.vector() %>% unique()
204
- if (uniq) {
205
- return(uniq_ids)
206
- }
207
-
208
- df %>%
209
- mutate(
210
- .group = if_else(
211
- !!group.by == ident_1,
212
- "ident_1",
213
- if_else(ident_2 != "<NULL>" & !!group.by != ident_2, NA, "ident_2")
214
- ),
215
- .out = if_else(!!id %in% uniq_ids & !!subset & !is.na(.group), !!id, NA)
216
- ) %>%
217
- pull(.out)
218
- }
219
-
220
- #' @export
221
- expanded <- function(
222
- df = .,
223
- group.by, # nolint
224
- idents,
225
- subset = TRUE,
226
- each = NULL,
227
- id = CDR3.aa,
228
- compare = .n,
229
- uniq = TRUE,
230
- debug = FALSE,
231
- order = desc(.sum),
232
- include_emerged = FALSE
233
- ) {
234
- lbl <- as_label(enquo(df))
235
- if (length(lbl) == 1 && lbl == ".") {
236
- df <- across(everything())
237
- }
238
- fun = if (include_emerged) "expanded+" else "expanded"
239
- .size_compare(
240
- df = df,
241
- group.by = enquo(group.by),
242
- idents = idents,
243
- subset = enexpr(subset),
244
- id = enquo(id),
245
- compare = enquo(compare),
246
- fun = fun,
247
- each = tryCatch(enquo(each), error = function(e) NULL),
248
- uniq = uniq,
249
- order = enexpr(order),
250
- debug = debug
251
- )
252
- }
253
-
254
- #' @export
255
- collapsed <- function(
256
- df = .,
257
- group.by, # nolint
258
- idents,
259
- subset = TRUE,
260
- each = NULL,
261
- id = CDR3.aa,
262
- compare = .n,
263
- uniq = TRUE,
264
- debug = FALSE,
265
- order = desc(.sum),
266
- include_vanished = FALSE
267
- ) {
268
- lbl <- as_label(enquo(df))
269
- if (length(lbl) == 1 && lbl == ".") {
270
- df <- across(everything())
271
- }
272
- fun = if (include_vanished) "collapsed+" else "collapsed"
273
- .size_compare(
274
- df = df,
275
- group.by = enquo(group.by),
276
- idents = idents,
277
- subset = enexpr(subset),
278
- id = enquo(id),
279
- compare = enquo(compare),
280
- fun = fun,
281
- each = tryCatch(enquo(each), error = function(e) NULL),
282
- uniq = uniq,
283
- order = enexpr(order),
284
- debug = debug
285
- )
286
- }
287
-
288
- #' @export
289
- emerged <- function(
290
- df = .,
291
- group.by, # nolint
292
- idents,
293
- subset = TRUE,
294
- each = NULL,
295
- id = CDR3.aa,
296
- compare = .n,
297
- uniq = TRUE,
298
- debug = FALSE,
299
- order = desc(.sum)
300
- ) {
301
- lbl <- as_label(enquo(df))
302
- if (length(lbl) == 1 && lbl == ".") {
303
- df <- across(everything())
304
- }
305
- .size_compare(
306
- df = df,
307
- group.by = enquo(group.by),
308
- idents = idents,
309
- subset = enexpr(subset),
310
- id = enquo(id),
311
- compare = enquo(compare),
312
- fun = "emerged",
313
- each = tryCatch(enquo(each), error = function(e) NULL),
314
- uniq = uniq,
315
- order = enexpr(order),
316
- debug = debug
317
- )
318
- }
319
-
320
- #' @export
321
- vanished <- function(
322
- df = .,
323
- group.by, # nolint
324
- idents,
325
- subset = TRUE,
326
- each = NULL,
327
- id = CDR3.aa,
328
- compare = .n,
329
- uniq = TRUE,
330
- debug = FALSE,
331
- order = desc(.sum)
332
- ) {
333
- lbl <- as_label(enquo(df))
334
- if (length(lbl) == 1 && lbl == ".") {
335
- df <- across(everything())
336
- }
337
- .size_compare(
338
- df = df,
339
- group.by = enquo(group.by),
340
- idents = idents,
341
- subset = enexpr(subset),
342
- id = enquo(id),
343
- compare = enquo(compare),
344
- fun = "vanished",
345
- each = tryCatch(enquo(each), error = function(e) NULL),
346
- uniq = uniq,
347
- order = enexpr(order),
348
- debug = debug
349
- )
350
- }
351
-
352
- #' Get paired entities from a data frame based on the other column
353
- #'
354
- #' @rdname Get paired entities
355
- #' @param df The data frame. Use `.` if the function is called in a dplyr pipe.
356
- #' @param id The column name in `df` for the ids to be returned in the
357
- #' final output
358
- #' @param compare The column name in `df` to compare the values for each
359
- #' id in `id`.
360
- #' @param idents The values in `compare` to compare. It could be either an
361
- #' integer or a vector. If it is an integer, the number of values in
362
- #' `compare` must be the same as the integer for the `id` to be regarded
363
- #' as paired. If it is a vector, the values in `compare` must be the same
364
- #' as the values in `idents` for the `id` to be regarded as paired.
365
- #' @param uniq Whether to return unique ids or not. Default is `TRUE`.
366
- #' If `FALSE`, you can mutate the meta data frame with the returned ids.
367
- #' Non-paired ids will be `NA`.
368
- #' @return A vector of paired ids (in `id` column)
369
- #' @examples
370
- #' df <- tibble(
371
- #' id = c("A", "A", "B", "B", "C", "C", "D", "D"),
372
- #' compare = c(1, 2, 1, 1, 1, 2, 1, 2)
373
- #' )
374
- #' paired(df, id, compare, 2)
375
- #' # [1] "A" "B" "C" "D"
376
- #' paired(df, id, compare, c(1, 2))
377
- #' # [1] "A" "C" "D"
378
- #' paired(df, id, compare, c(1, 2), uniq = FALSE)
379
- #' # [1] "A" "A" NA NA "C" "C" "D" "D"
380
- #'
381
- paired <- function(
382
- df = .,
383
- id,
384
- compare,
385
- idents = 2,
386
- uniq = TRUE
387
- ) {
388
- lbl <- as_label(enquo(df))
389
- if (length(lbl) == 1 && lbl == ".") {
390
- df <- across(everything())
391
- }
392
-
393
- id <- enquo(id)
394
- compare <- enquo(compare)
395
- if (is_empty(attr(id, ".Environment"))) {
396
- id <- sym(as_name(id))
397
- }
398
- if (is_empty(attr(compare, ".Environment"))) {
399
- compare <- sym(as_name(compare))
400
- }
401
- if (!as_name(id) %in% colnames(df)) {
402
- stop(paste0(
403
- '`id` must be a column name in df. Got "',
404
- as_name(id),
405
- '"'
406
- ))
407
- }
408
- if (!as_name(compare) %in% colnames(df)) {
409
- stop(paste0(
410
- '`compare` must be a column name in df. Got "',
411
- as_name(compare),
412
- '"'
413
- ))
414
- }
415
-
416
- if (is.numeric(idents) && length(idents) == 1) {
417
- if (idents <= 1) {
418
- stop(paste0(
419
- '`idents` must be greater than 1. Got ',
420
- idents
421
- ))
422
- }
423
- out <- df %>%
424
- add_count(!!id, name = "..count") %>%
425
- mutate(..paired = if_else(..count == idents, !!id, NA))
426
- } else {
427
- if (length(idents) <= 1) {
428
- stop(paste0(
429
- '`idents` must be a vector with length greater than 1. Got ',
430
- length(idents)
431
- ))
432
- }
433
- out <- df %>%
434
- group_by(!!id) %>%
435
- mutate(
436
- ..paired = if_else(
437
- rep(setequal(!!compare, idents), n()),
438
- !!id,
439
- NA
440
- )
441
- ) %>%
442
- ungroup()
443
- }
444
-
445
- out <- out %>% pull(..paired)
446
- if (uniq) {
447
- return(out %>% na.omit() %>% unique() %>% as.vector())
448
- } else {
449
- return(out)
450
- }
451
- }
452
-
453
- #' @export
454
- #' @rdname Get top entities by size of group
455
- #' @param df The data frame. Use `.` if the function is called in a dplyr pipe.
456
- #' @param id The column name in `df` for the groups.
457
- #' @param compare The column name in `df` to compare the values for each group.
458
- #' It could be either a numeric column or `.n` to compare the number of
459
- #' entities in each group. If a column is passed, the values in the column
460
- #' must be numeric and the same in each group. This won't be checked.
461
- #' @param n The number of top entities to return. if `n` < 1, it will be
462
- #' regarded as the percentage of the total number of entities in each group
463
- #' (after subsetting or each applied).
464
- #' Specify 0 to return all entities.
465
- #' @param subset An expression to subset the entities, will be passed to
466
- #' `dplyr::filter()`. Default is `TRUE` (no filtering).
467
- #' @param with_ties Whether to return all entities with the same size as the
468
- #' last entity in the top list. Default is `FALSE`.
469
- #' @param each A column name (without quotes) in metadata to split the cells.
470
- #' @param debug Return the transformed data frame with counts and predicates
471
- #' @param uniq Whether to return unique ids or not. Default is `TRUE`.
472
- #' If `FALSE`, you can mutate the meta data frame with the returned ids.
473
- top <- function(
474
- df = .,
475
- id = CDR3.aa,
476
- n = 10,
477
- compare = .n,
478
- subset = TRUE,
479
- with_ties = FALSE,
480
- each = NULL,
481
- debug = FALSE,
482
- uniq = TRUE
483
- ) {
484
- lbl <- as_label(enquo(df))
485
- if (length(lbl) == 1 && lbl == ".") {
486
- df <- across(everything())
487
- }
488
-
489
- id <- enquo(id)
490
- compare <- enquo(compare)
491
- if (is.character(subset)) {
492
- subset <- parse_expr(subset)
493
- } else {
494
- subset <- enexpr(subset)
495
- }
496
-
497
- each <- tryCatch(enquo(each), error = function(e) NULL)
498
- if (is_empty(attr(id, ".Environment"))) {
499
- id <- sym(as_name(id))
500
- }
501
- if (is_empty(attr(compare, ".Environment"))) {
502
- compare <- sym(as_name(compare))
503
- }
504
- if (!as_name(id) %in% colnames(df)) {
505
- stop(paste0(
506
- '`id` must be a column name in df. Got "',
507
- as_name(id),
508
- '"'
509
- ))
510
- }
511
- if (!as_name(compare) %in% colnames(df) && as_name(compare) != '.n') {
512
- stop(paste0(
513
- '`compare` must be a column name in df. Got "',
514
- as_name(compare),
515
- '"'
516
- ))
517
- }
518
- if (is_empty(attr(each, ".Environment"))) {
519
- if (as_label(each) == "NULL") {
520
- each <- NULL
521
- } else {
522
- each <- sym(as_name(each))
523
- }
524
- }
525
- if (!is.null(each) && !as_name(each) %in% colnames(df)) {
526
- stop(paste0(
527
- '`each` must be a column name in df. Got "',
528
- as_name(each),
529
- '"'
530
- ))
531
- }
532
-
533
- subdf <- df %>% dplyr::filter(!!subset) %>% tidyr::drop_na(!!id)
534
-
535
- handle_one_each <- function(d) {
536
- if (!is.null(each)) {
537
- d <- d %>% group_by(!!each, !!id)
538
- } else {
539
- d <- d %>% group_by(!!id)
540
- }
541
- d <- d %>%
542
- dplyr::summarise(.n = dplyr::n(), .groups = "drop") %>%
543
- dplyr::arrange(dplyr::desc(!!compare))
544
-
545
- if (n > 0 && n < 1) {
546
- o <- d %>% dplyr::slice_max(prop = n, order_by = !!compare, with_ties = with_ties)
547
- } else if (n >= 1) {
548
- o <- d %>% dplyr::slice_max(n = n, order_by = !!compare, with_ties = with_ties)
549
- } else {
550
- o <- d
551
- }
552
- d %>% dplyr::mutate(.predicate = !!id %in% dplyr::pull(o, !!id))
553
- }
554
-
555
- if (is.null(each)) {
556
- out <- handle_one_each(subdf)
557
- } else {
558
- out <- subdf %>% dplyr::group_by(!!each) %>%
559
- dplyr::group_split() %>%
560
- purrr::map(handle_one_each) %>%
561
- dplyr::bind_rows()
562
- }
563
-
564
- if (isTRUE(debug)) {
565
- return(out)
566
- }
567
-
568
- uniq_ids <- out %>% dplyr::filter(.predicate) %>%
569
- dplyr::pull(!!id) %>% as.vector() %>% unique()
570
- if (isTRUE(uniq)) {
571
- return(uniq_ids)
572
- }
573
-
574
- df <- df %>% left_join(
575
- out,
576
- by = if(is.null(each)) as_name(id) else c(as_name(each), as_name(id)))
577
-
578
- df %>% dplyr::mutate(
579
- .out = if_else(.predicate & !!subset, !!id, NA)
580
- ) %>% dplyr::pull(.out)
581
- }