biopipen 0.27.4__py3-none-any.whl → 0.27.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

biopipen/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.27.4"
1
+ __version__ = "0.27.5"
biopipen/core/testing.py CHANGED
@@ -51,15 +51,16 @@ class PipelineSucceeded:
51
51
  pipen._succeeded = succeeded
52
52
 
53
53
 
54
- def get_pipeline(testfile, loglevel="debug", **kwargs):
54
+ def get_pipeline(testfile, loglevel="debug", enable_report=False, **kwargs):
55
55
  """Get a pipeline for a test file"""
56
56
  name, workdir, outdir = _get_test_dirs(testfile, False)
57
+ report_plugin_prefix = "+" if enable_report else "-"
57
58
  kws = {
58
59
  "name": name,
59
60
  "workdir": workdir,
60
61
  "outdir": outdir,
61
62
  "loglevel": loglevel,
62
- "plugins": [PipelineSucceeded, "-report"],
63
+ "plugins": [PipelineSucceeded, f"{report_plugin_prefix}report"],
63
64
  }
64
65
  kws.update(kwargs)
65
66
  return Pipen(**kws)
biopipen/ns/scrna.py CHANGED
@@ -122,6 +122,9 @@ class SeuratPreparing(Proc):
122
122
  genes.
123
123
  ///
124
124
 
125
+ cell_qc_per_sample (flag): Whether to perform cell QC per sample or not.
126
+ If `True`, the cell QC will be performed per sample, and the QC will be
127
+ applied to each sample before merging.
125
128
  gene_qc (ns): Filter genes.
126
129
  `gene_qc` is applied after `cell_qc`.
127
130
  - min_cells: The minimum number of cells that a gene must be
@@ -222,6 +225,7 @@ class SeuratPreparing(Proc):
222
225
  envs = {
223
226
  "ncores": config.misc.ncores,
224
227
  "cell_qc": None, # "nFeature_RNA > 200 & percent.mt < 5",
228
+ "cell_qc_per_sample": False,
225
229
  "gene_qc": {"min_cells": 0, "excludes": []},
226
230
  "use_sct": False,
227
231
  "no_integration": False,
@@ -1483,14 +1487,17 @@ class SeuratTo10X(Proc):
1483
1487
  srtobj: The seurat object in RDS
1484
1488
 
1485
1489
  Output:
1486
- outdir: The output directory
1490
+ outdir: The output directory.
1491
+ When `envs.split_by` is specified, the subdirectories will be
1492
+ created for each distinct value of the column.
1493
+ Otherwise, the matrices will be written to the output directory.
1487
1494
 
1488
1495
  Envs:
1489
1496
  version: The version of 10X format
1490
1497
  """
1491
1498
  input = "srtobj:file"
1492
1499
  output = "outdir:dir:{{in.srtobj | stem}}"
1493
- envs = {"version": "3"}
1500
+ envs = {"version": "3", "split_by": None}
1494
1501
  lang = config.lang.rscript
1495
1502
  script = "file://../scripts/scrna/SeuratTo10X.R"
1496
1503
 
@@ -81,7 +81,7 @@ do_one_features = function(name) {
81
81
  if (case$kind %in% c("ridge", "ridgeplot")) {
82
82
  case$kind = "ridge"
83
83
  if (is.null(case$cols)) {
84
- case$cols = pal_biopipen()(32)
84
+ case$cols = pal_biopipen()(n_uidents)
85
85
  }
86
86
  excluded_args = c(excluded_args, "split.by", "reduction")
87
87
  fn = RidgePlot
@@ -4,6 +4,7 @@ library(Seurat)
4
4
  library(future)
5
5
  library(bracer)
6
6
  library(ggplot2)
7
+ library(dplyr)
7
8
  library(tidyseurat)
8
9
 
9
10
  metafile = {{in.metafile | quote}}
@@ -49,6 +50,19 @@ if (!"RNAData" %in% meta_cols) {
49
50
  stop("Error: Column `RNAData` is not found in metafile.")
50
51
  }
51
52
 
53
+ samples = as.character(metadata$Sample)
54
+
55
+ # used for plotting
56
+ cell_qc_df = NULL
57
+
58
+ plotsdir = file.path(joboutdir, "plots")
59
+ dir.create(plotsdir, showWarnings = FALSE, recursive = TRUE)
60
+
61
+ # features for cell QC
62
+ feats = c(
63
+ "nFeature_RNA", "nCount_RNA",
64
+ "percent.mt", "percent.ribo", "percent.hb", "percent.plat"
65
+ )
52
66
 
53
67
  rename_files = function(e, sample, path) {
54
68
  tmpdatadir = file.path(joboutdir, "renamed", sample)
@@ -74,6 +88,143 @@ rename_files = function(e, sample, path) {
74
88
  Read10X(data.dir = tmpdatadir)
75
89
  }
76
90
 
91
+
92
+ perform_cell_qc <- function(sobj, per_sample = FALSE) {
93
+ log_prefix = ifelse(per_sample, " ", "- ")
94
+ log_info("{log_prefix}Adding metadata for QC ...")
95
+ sobj$percent.mt = PercentageFeatureSet(sobj, pattern = "^MT-")
96
+ sobj$percent.ribo = PercentageFeatureSet(sobj, pattern = "^RP[SL]")
97
+ sobj$percent.hb = PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
98
+ sobj$percent.plat = PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")
99
+
100
+ if (is.null(envs$cell_qc) || length(envs$cell_qc) == 0) {
101
+ log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
102
+ cell_qc = "TRUE"
103
+ } else {
104
+ cell_qc = envs$cell_qc
105
+ }
106
+
107
+ sobj = sobj %>% mutate(.QC = !!rlang::parse_expr(cell_qc))
108
+
109
+ if (is.null(cell_qc_df)) {
110
+ cell_qc_df <<- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
111
+ } else {
112
+ cell_qc_df <<- rbind(cell_qc_df, sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE])
113
+ }
114
+
115
+ # Do the filtering
116
+ log_info("{log_prefix}Filtering cells using QC criteria ...")
117
+ sobj = sobj %>% filter(.QC)
118
+ sobj$.QC = NULL
119
+
120
+ return(sobj)
121
+ }
122
+
123
+ report_cell_qc = function(ngenes) {
124
+ # uses cell_qc_df
125
+
126
+ # Violin plots
127
+ log_info("- Plotting violin plots ...")
128
+ add_report(
129
+ list(
130
+ kind = "descr",
131
+ content = paste(
132
+ "The violin plots for each feature. The cells are grouped by sample.",
133
+ "The cells that fail the QC criteria are colored in red, and",
134
+ "the cells that pass the QC criteria are colored in black.",
135
+ "The cells that fail the QC criteria are filtered out in the returned Seurat object."
136
+ )
137
+ ),
138
+ h1 = "Violin Plots"
139
+ )
140
+ for (feat in feats) {
141
+ log_info(" For feature: {feat}")
142
+ vln_p <- ggplot(cell_qc_df, aes(x = Sample, y = !!sym(feat), color = .QC)) +
143
+ geom_violin(fill = "white", width = 0.5) +
144
+ geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
145
+ scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
146
+ labs(x = "Sample", y = feat) +
147
+ theme_minimal()
148
+
149
+ vlnplot = file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
150
+ png(
151
+ vlnplot,
152
+ width = 800 + length(samples) * 15, height = 600, res = 100
153
+ )
154
+ print(vln_p)
155
+ dev.off()
156
+
157
+ add_report(
158
+ list(
159
+ src = vlnplot,
160
+ name = feat,
161
+ descr = paste0("Distribution of ", feat, " for each sample.")
162
+ ),
163
+ h1 = "Violin Plots",
164
+ ui = "table_of_images"
165
+ )
166
+ }
167
+
168
+ # Scatter plots against nCount_RNA
169
+ log_info("- Plotting scatter plots ...")
170
+ add_report(
171
+ list(
172
+ kind = "descr",
173
+ content = paste(
174
+ "The scatter plots for each feature against nCount_RNA. ",
175
+ "The cells that fail the QC criteria are colored in red, and",
176
+ "the cells that pass the QC criteria are colored in black.",
177
+ "The cells that fail the QC criteria are filtered out in the returned Seurat object."
178
+ )
179
+ ),
180
+ h1 = "Scatter Plots"
181
+ )
182
+ for (feat in setdiff(feats, "nCount_RNA")) {
183
+ log_info(" For feature: {feat}, against nCount_RNA")
184
+ scat_p <- ggplot(cell_qc_df, aes(x = nCount_RNA, y = !!sym(feat), color = .QC)) +
185
+ geom_point() +
186
+ scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
187
+ labs(x = "nCount_RNA", y = feat) +
188
+ theme_minimal()
189
+
190
+ scatfile = file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.png"))
191
+ png(scatfile, width = 800, height = 600, res = 100)
192
+ print(scat_p)
193
+ dev.off()
194
+
195
+ add_report(
196
+ list(
197
+ src = scatfile,
198
+ name = paste0(feat, " vs nCount_RNA"),
199
+ descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
200
+ ),
201
+ h1 = "Scatter Plots",
202
+ ui = "table_of_images"
203
+ )
204
+ }
205
+
206
+ # return the dim_df calculated from the cell_qc_df
207
+ rbind(
208
+ cell_qc_df %>%
209
+ # group_by(Sample) %>%
210
+ summarise(
211
+ when = "Before_Cell_QC",
212
+ nCells = dplyr::n(),
213
+ nGenes = ngenes
214
+ ) %>%
215
+ ungroup(),
216
+ cell_qc_df %>%
217
+ filter(.QC) %>%
218
+ # group_by(Sample) %>%
219
+ summarise(
220
+ when = "After_Cell_QC",
221
+ nCells = dplyr::n(),
222
+ nGenes = ngenes
223
+ ) %>%
224
+ ungroup()
225
+ )
226
+ }
227
+
77
228
  load_sample = function(sample) {
78
229
  log_info("- Loading sample: {sample} ...")
79
230
  mdata = as.data.frame(metadata)[metadata$Sample == sample, , drop=TRUE]
@@ -114,6 +265,11 @@ load_sample = function(sample) {
114
265
  obj[[mname]] = mdt
115
266
  }
116
267
 
268
+ if (isTRUE(envs$cell_qc_per_sample)) {
269
+ log_info("- Perform cell QC for sample: {sample} ...")
270
+ obj = perform_cell_qc(obj, TRUE)
271
+ }
272
+
117
273
  if (isTRUE(envs$use_sct)) {
118
274
  # so that we have data and scale.data layers on RNA assay
119
275
  # useful for visualization in case some genes are not in
@@ -126,125 +282,20 @@ load_sample = function(sample) {
126
282
  }
127
283
 
128
284
  # Load data
129
- samples = as.character(metadata$Sample)
130
-
131
285
  log_info("Reading samples individually ...")
132
286
  obj_list = lapply(samples, load_sample)
133
287
 
134
288
  log_info("Merging samples ...")
135
289
  sobj = Reduce(merge, obj_list)
136
290
 
137
- log_info("Adding metadata for QC ...")
138
- sobj$percent.mt = PercentageFeatureSet(sobj, pattern = "^MT-")
139
- sobj$percent.ribo = PercentageFeatureSet(sobj, pattern = "^RP[SL]")
140
- sobj$percent.hb = PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
141
- sobj$percent.plat = PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")
142
-
143
- dim_df = data.frame(When = "Before_QC", nCells = ncol(sobj), nGenes = nrow(sobj))
144
-
145
- if (is.null(envs$cell_qc) || length(envs$cell_qc) == 0) {
146
- log_warn("No cell QC criteria is provided. All cells will be kept.")
147
- envs$cell_qc = "TRUE"
148
- }
149
-
150
- sobj = sobj %>% mutate(.QC = !!rlang::parse_expr(envs$cell_qc))
151
- feats = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.ribo", "percent.hb", "percent.plat")
152
- plotsdir = file.path(joboutdir, "plots")
153
- dir.create(plotsdir, showWarnings = FALSE)
154
-
155
- # Violin plots
156
- log_info("Plotting violin plots ...")
157
- add_report(
158
- list(
159
- kind = "descr",
160
- content = paste(
161
- "The violin plots for each feature. The cells are grouped by sample.",
162
- "The cells that fail the QC criteria are colored in red, and",
163
- "the cells that pass the QC criteria are colored in black.",
164
- "The cells that fail the QC criteria are filtered out in the returned Seurat object."
165
- )
166
- ),
167
- h1 = "Violin Plots"
168
- )
169
- for (feat in feats) {
170
- log_info("- For feature: {feat}")
171
- vln_p = VlnPlot(
172
- sobj,
173
- cols = rep("white", length(samples)),
174
- group.by = "Sample",
175
- features = feat,
176
- pt.size = 0) + NoLegend()
177
- vln_p$data$.QC = sobj@meta.data$.QC
178
- vln_p = vln_p + geom_jitter(
179
- aes(color = .QC),
180
- data = vln_p$data,
181
- position = position_jitterdodge(jitter.width = 0.4, dodge.width = 0.9)
182
- ) + scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE))
183
-
184
- vlnplot = file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
185
- png(
186
- vlnplot,
187
- width = 800 + length(samples) * 15, height = 600, res = 100
188
- )
189
- print(vln_p)
190
- dev.off()
191
-
192
- add_report(
193
- list(
194
- src = vlnplot,
195
- name = feat,
196
- descr = paste0("Distribution of ", feat, " for each sample.")
197
- ),
198
- h1 = "Violin Plots",
199
- ui = "table_of_images"
200
- )
201
- }
202
-
203
- # Scatter plots against nCount_RNA
204
- log_info("Plotting scatter plots ...")
205
- add_report(
206
- list(
207
- kind = "descr",
208
- content = paste(
209
- "The scatter plots for each feature against nCount_RNA. ",
210
- "The cells that fail the QC criteria are colored in red, and",
211
- "the cells that pass the QC criteria are colored in black.",
212
- "The cells that fail the QC criteria are filtered out in the returned Seurat object."
213
- )
214
- ),
215
- h1 = "Scatter Plots"
216
- )
217
- for (feat in setdiff(feats, "nCount_RNA")) {
218
- log_info("- For feature: {feat}, against nCount_RNA")
219
- scat_p = FeatureScatter(
220
- sobj,
221
- feature1 = "nCount_RNA",
222
- feature2 = feat,
223
- group.by = ".QC"
224
- ) +
225
- NoLegend() +
226
- scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE))
227
-
228
- scatfile = file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.png"))
229
- png(scatfile, width = 800, height = 600, res = 100)
230
- print(scat_p)
231
- dev.off()
232
-
233
- add_report(
234
- list(
235
- src = scatfile,
236
- name = paste0(feat, " vs nCount_RNA"),
237
- descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
238
- ),
239
- h1 = "Scatter Plots",
240
- ui = "table_of_images"
241
- )
291
+ if (!envs$cell_qc_per_sample) {
292
+ log_info("Performing cell QC ...")
293
+ sobj = perform_cell_qc(sobj)
242
294
  }
243
295
 
244
- # Do the filtering
245
- log_info("Filtering cells using QC criteria ...")
246
- sobj = sobj %>% filter(.QC)
247
- sobj$.QC = NULL
296
+ # plot and report the QC
297
+ log_info("Plotting and reporting QC ...")
298
+ dim_df = report_cell_qc(nrow(sobj))
248
299
 
249
300
  log_info("Filtering genes ...")
250
301
  if (is.list(envs$gene_qc)) {
@@ -271,7 +322,7 @@ if (is.list(envs$gene_qc)) {
271
322
  dim_df = rbind(
272
323
  dim_df,
273
324
  data.frame(
274
- When = "After_Gene_QC",
325
+ when = "After_Gene_QC",
275
326
  nCells = ncol(sobj),
276
327
  nGenes = nrow(sobj)
277
328
  )
@@ -1,84 +1,27 @@
1
- library(Matrix)
2
-
3
- indir = {{in.indir | quote}}
4
- outdir = {{out.outdir | quote}}
5
- envs = {{envs | r}}
6
-
7
- set.seed(envs$seed)
8
- setwd(outdir)
9
-
10
- logger <- function(...) {
11
- cat(paste(..., "\n"), file=stderr())
12
- }
13
-
14
- # Find the data files
15
- mtx_file = Sys.glob(file.path(indir, "*matrix.mtx.gz"))
16
- feat_file = c(
17
- Sys.glob(file.path(indir, "*genes.tsv.gz")),
18
- Sys.glob(file.path(indir, "*features.tsv.gz"))
19
- )
20
- barcode_file = Sys.glob(file.path(indir, "*barcodes.tsv.gz"))
21
- if (length(mtx_file) == 0) {
22
- stop("No matrix file found in", indir)
23
- }
24
- if (length(mtx_file) > 1) {
25
- warning(paste("Multiple matrix files found in", indir, ", using the first one."))
26
- }
27
- if (length(feat_file) == 0) {
28
- stop("No feature file found in", indir)
29
- }
30
- if (length(feat_file) > 1) {
31
- warning(paste("Multiple feature files found in", indir, ", using the first one."))
32
- }
33
- if (length(barcode_file) == 0) {
34
- stop("No barcode file found in", indir)
35
- }
36
- if (length(barcode_file) > 1) {
37
- warning(paste("Multiple barcode files found in", indir, ", using the first one."))
38
- }
39
-
40
- mtx = readMM(mtx_file)
41
- n_feats = nrow(mtx)
42
- n_cells = ncol(mtx)
43
- logger("- Dimension: Features:", n_feats, ", Cells:", n_cells)
44
-
45
- if (envs$nfeats <= 1) {
46
- nfeats = as.integer(n_feats * envs$nfeats)
1
+ library(DropletUtils)
2
+ library(Seurat)
3
+
4
+ srtobjfile = {{in.srtobj | r}}
5
+ outdir = {{out.outdir | r}}
6
+ version = {{envs.version | r}}
7
+ split_by = {{envs.split_by | r}}
8
+
9
+ srtobj = readRDS(srtobjfile)
10
+ if (!is.null(split_by)) {
11
+ # check if split_by is a valid column
12
+ if (is.null(srtobj[[split_by]])) {
13
+ stop(paste0("Column ", split_by, " not found in Seurat object"))
14
+ }
15
+
16
+ # split Seurat object by split_by column
17
+ objs <- SplitObject(srtobj, split.by = split_by)
18
+ for (s in names(objs)) {
19
+ counts <- GetAssayData(object = objs[[s]], layer = "counts")
20
+ odir <- file.path(outdir, s)
21
+ dir.create(odir, recursive = TRUE, showWarnings = FALSE)
22
+ write10xCounts(odir, counts, version = version, overwrite = TRUE)
23
+ }
47
24
  } else {
48
- nfeats = envs$nfeats
49
- }
50
- if (envs$ncells <= 1) {
51
- ncells = as.integer(n_cells * envs$ncells)
52
- } else {
53
- ncells = envs$ncells
54
- }
55
-
56
- logger("- Identifying features to keep ...")
57
- feats = read.table(feat_file, header=FALSE, row.names=NULL, check.names=FALSE)
58
- feats_to_keep = c()
59
- if (length(envs$feats_to_keep) > 0) {
60
- feats_to_keep = match(envs$feats_to_keep, feats[,2])
25
+ counts = GetAssayData(object = srtobj, layer = "counts")
26
+ write10xCounts(outdir, counts, version = version, overwrite = TRUE)
61
27
  }
62
-
63
- out_feats = unique(c(sample(1:n_feats, nfeats), feats_to_keep))
64
- out_cells = sample(1:n_cells, ncells)
65
- logger("- Resulting in", length(out_feats), "features and", ncells, "cells")
66
-
67
- logger("- Subsetting matrix and saving it ...")
68
- out_mtx = mtx[out_feats, out_cells, drop=FALSE]
69
- out_mtx_file = file.path(outdir, "matrix.mtx")
70
- writeMM(out_mtx, out_mtx_file)
71
- system(paste("gzip", out_mtx_file))
72
-
73
- logger("- Subsetting features and saving it ...")
74
- out_feats = feats[out_feats, , drop=FALSE]
75
- out_feats_file = gzfile(file.path(outdir, "features.tsv.gz"), "w")
76
- write.table(out_feats, out_feats_file, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
77
- close(out_feats_file)
78
-
79
- logger("- Subsetting barcodes and saving it ...")
80
- barcodes = read.table(barcode_file, header=FALSE, row.names=NULL, check.names=FALSE)
81
- out_barcodes = barcodes[out_cells, , drop=FALSE]
82
- out_barcodes_file = gzfile(file.path(outdir, "barcodes.tsv.gz"), "w")
83
- write.table(out_barcodes, out_barcodes_file, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
84
- close(out_barcodes_file)
@@ -0,0 +1,84 @@
1
+ library(Matrix)
2
+
3
+ indir = {{in.indir | quote}}
4
+ outdir = {{out.outdir | quote}}
5
+ envs = {{envs | r}}
6
+
7
+ set.seed(envs$seed)
8
+ setwd(outdir)
9
+
10
+ logger <- function(...) {
11
+ cat(paste(..., "\n"), file=stderr())
12
+ }
13
+
14
+ # Find the data files
15
+ mtx_file = Sys.glob(file.path(indir, "*matrix.mtx.gz"))
16
+ feat_file = c(
17
+ Sys.glob(file.path(indir, "*genes.tsv.gz")),
18
+ Sys.glob(file.path(indir, "*features.tsv.gz"))
19
+ )
20
+ barcode_file = Sys.glob(file.path(indir, "*barcodes.tsv.gz"))
21
+ if (length(mtx_file) == 0) {
22
+ stop("No matrix file found in", indir)
23
+ }
24
+ if (length(mtx_file) > 1) {
25
+ warning(paste("Multiple matrix files found in", indir, ", using the first one."))
26
+ }
27
+ if (length(feat_file) == 0) {
28
+ stop("No feature file found in", indir)
29
+ }
30
+ if (length(feat_file) > 1) {
31
+ warning(paste("Multiple feature files found in", indir, ", using the first one."))
32
+ }
33
+ if (length(barcode_file) == 0) {
34
+ stop("No barcode file found in", indir)
35
+ }
36
+ if (length(barcode_file) > 1) {
37
+ warning(paste("Multiple barcode files found in", indir, ", using the first one."))
38
+ }
39
+
40
+ mtx = readMM(mtx_file)
41
+ n_feats = nrow(mtx)
42
+ n_cells = ncol(mtx)
43
+ logger("- Dimension: Features:", n_feats, ", Cells:", n_cells)
44
+
45
+ if (envs$nfeats <= 1) {
46
+ nfeats = as.integer(n_feats * envs$nfeats)
47
+ } else {
48
+ nfeats = envs$nfeats
49
+ }
50
+ if (envs$ncells <= 1) {
51
+ ncells = as.integer(n_cells * envs$ncells)
52
+ } else {
53
+ ncells = envs$ncells
54
+ }
55
+
56
+ logger("- Identifying features to keep ...")
57
+ feats = read.table(feat_file, header=FALSE, row.names=NULL, check.names=FALSE)
58
+ feats_to_keep = c()
59
+ if (length(envs$feats_to_keep) > 0) {
60
+ feats_to_keep = match(envs$feats_to_keep, feats[,2])
61
+ }
62
+
63
+ out_feats = unique(c(sample(1:n_feats, nfeats), feats_to_keep))
64
+ out_cells = sample(1:n_cells, ncells)
65
+ logger("- Resulting in", length(out_feats), "features and", ncells, "cells")
66
+
67
+ logger("- Subsetting matrix and saving it ...")
68
+ out_mtx = mtx[out_feats, out_cells, drop=FALSE]
69
+ out_mtx_file = file.path(outdir, "matrix.mtx")
70
+ writeMM(out_mtx, out_mtx_file)
71
+ system(paste("gzip", out_mtx_file))
72
+
73
+ logger("- Subsetting features and saving it ...")
74
+ out_feats = feats[out_feats, , drop=FALSE]
75
+ out_feats_file = gzfile(file.path(outdir, "features.tsv.gz"), "w")
76
+ write.table(out_feats, out_feats_file, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
77
+ close(out_feats_file)
78
+
79
+ logger("- Subsetting barcodes and saving it ...")
80
+ barcodes = read.table(barcode_file, header=FALSE, row.names=NULL, check.names=FALSE)
81
+ out_barcodes = barcodes[out_cells, , drop=FALSE]
82
+ out_barcodes_file = gzfile(file.path(outdir, "barcodes.tsv.gz"), "w")
83
+ write.table(out_barcodes, out_barcodes_file, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
84
+ close(out_barcodes_file)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: biopipen
3
- Version: 0.27.4
3
+ Version: 0.27.5
4
4
  Summary: Bioinformatics processes/pipelines that can be run from `pipen run`
5
5
  License: MIT
6
6
  Author: pwwang
@@ -1,11 +1,11 @@
1
- biopipen/__init__.py,sha256=FRehirBY8kLByuBXp81U_RUAg8WYLFropNPtg2RpV2w,23
1
+ biopipen/__init__.py,sha256=E1FuUUku2gzKP9EaIByX13BXhDU2SYE99gN_s2YdX7s,23
2
2
  biopipen/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  biopipen/core/config.py,sha256=edK5xnDhM8j27srDzsxubi934NMrglLoKrdcC8qsEPk,1069
4
4
  biopipen/core/config.toml,sha256=20RCI30Peee1EQdfb_UbV3Hf74XUPndJnYZlUThytsw,1781
5
5
  biopipen/core/defaults.py,sha256=yPeehPLk_OYCf71IgRVCWuQRxLAMixDF81Ium0HtPKI,344
6
6
  biopipen/core/filters.py,sha256=HLrjXGsvvjRtTWIAmg_f4IMymWaRD769HlDwsCTh170,12424
7
7
  biopipen/core/proc.py,sha256=60lUP3PcUAaKbDETo9N5PEIoeOYrLgcSmuytmrhcx8g,912
8
- biopipen/core/testing.py,sha256=6BaHm8C7oHdnC5q14DBd0Qp1wqNxSexSFc5vUtHZjsw,3565
8
+ biopipen/core/testing.py,sha256=fZ8lzLwM5AhYapx0LDdYZPumqC0dj7GZpQuabhlqyGI,3665
9
9
  biopipen/ns/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  biopipen/ns/bam.py,sha256=5AsYrB0mtr_mH6mCL6gjJ5rC4NywpjFkpFjUrBGp7Fk,9301
11
11
  biopipen/ns/bcftools.py,sha256=puCDfIL-1z6cz2y1Rlz-ESNIr8xJgeIjEQ440qicCvM,3467
@@ -21,7 +21,7 @@ biopipen/ns/gsea.py,sha256=EsNRAPYsagaV2KYgr4Jv0KCnZGqayM209v4yOGGTIOI,7423
21
21
  biopipen/ns/misc.py,sha256=fzn0pXvdghMkQhu-e3MMapPNMyO6IAJbtTzVU3GbFa0,3246
22
22
  biopipen/ns/plot.py,sha256=fzJAKKl4a_tsVkLREGCQTFVHP049m33LdWgeYRb6v7M,5483
23
23
  biopipen/ns/rnaseq.py,sha256=bKAa6friFWof4yDTWZQahm1MS-lrdetO1GqDKdfxXYc,7708
24
- biopipen/ns/scrna.py,sha256=u0p2eVqB2T7vpg-19NN0277QKChTsv9yxM6xQA6pJHg,103464
24
+ biopipen/ns/scrna.py,sha256=7Gs1xxQoGM3TKxaQvbgKNyMDEsgatFopImzC-RcOEoA,103946
25
25
  biopipen/ns/scrna_metabolic_landscape.py,sha256=EhOtHQyoH-jRpzDoOI_06UbjEg6mhvbDEHKhek01bPk,28334
26
26
  biopipen/ns/snp.py,sha256=EQ2FS0trQ7YThPmBVTpS66lc2OSfgQ6lCh6WnyP-C2g,5499
27
27
  biopipen/ns/stats.py,sha256=yJ6C1CXF84T7DDs9mgufqUOr89Rl6kybE5ji8Vnx6cw,13693
@@ -138,7 +138,7 @@ biopipen/scripts/scrna/SCImpute.R,sha256=dSJOHhmJ3x_72LBRXT72dbCti5oiB85CJ-OjWtq
138
138
  biopipen/scripts/scrna/ScFGSEA.R,sha256=2UCTCIydVkPGvn7WP-_fcE7857iKKDxY56-j-ruyO8o,6254
139
139
  biopipen/scripts/scrna/Seurat2AnnData.R,sha256=qz4u-B5J3GMwttubnNnByJXreziFbrP5Mak0L0q7eG0,1557
140
140
  biopipen/scripts/scrna/SeuratClusterStats-dimplots.R,sha256=gViDgQ8NorYD64iK0FgcODOrDOw0tExZmhuPRuLNp4g,2354
141
- biopipen/scripts/scrna/SeuratClusterStats-features.R,sha256=SaKTJloP1fttRXZQeb2ApX0ej7al13wOoEYkthSk13k,15489
141
+ biopipen/scripts/scrna/SeuratClusterStats-features.R,sha256=W7iYhaFsC5EMZLO50QukYPLYGK4bq9kQc1VT5FwvI68,15496
142
142
  biopipen/scripts/scrna/SeuratClusterStats-hists.R,sha256=YhuD-GePjJPSkR0iLRgV_hiGHD_bnOIKp-LB6GCwquo,5037
143
143
  biopipen/scripts/scrna/SeuratClusterStats-ngenes.R,sha256=GVKIXFNS_syCuSN8oxoBkjxxAeI5LdSxh-qLVkUsbDA,2146
144
144
  biopipen/scripts/scrna/SeuratClusterStats-stats.R,sha256=TxQ0OcLwXwIgwL1mTLArboK0ATJIJhxWiv9DV_jBlhE,9255
@@ -148,13 +148,13 @@ biopipen/scripts/scrna/SeuratFilter.R,sha256=BrYK0MLdaTtQvInMaQsmOt7oH_hlks0M1zy
148
148
  biopipen/scripts/scrna/SeuratLoading.R,sha256=ekWKnHIqtQb3kHVQiVymAHXXqiUxs6KKefjZKjaykmk,900
149
149
  biopipen/scripts/scrna/SeuratMap2Ref.R,sha256=Xn3VnvKqShuC0Ju05380wjuLVSdW0uWVzntdxjme244,4359
150
150
  biopipen/scripts/scrna/SeuratMetadataMutater.R,sha256=Pp4GsF3hZ6ZC2vroC3LSBmVa4B1p2L3hbh981yaAIeQ,1093
151
- biopipen/scripts/scrna/SeuratPreparing.R,sha256=c_aBM0mugBNyYJ5OjNVDR_Cj0sGqkiJZXCOk3pesFDk,16990
151
+ biopipen/scripts/scrna/SeuratPreparing.R,sha256=t6GOcc9ZNwpRLeES7uBWja9RF6u6k5I_TXcdK4Ve7d0,18683
152
152
  biopipen/scripts/scrna/SeuratSplit.R,sha256=vdK11V39_Uo_NaOh76QWCtxObGaEr5Ynxqq0hTiSvsU,754
153
153
  biopipen/scripts/scrna/SeuratSubClustering.R,sha256=L1SwKhNNKvsQGrcj0ZjScW9BLuvdO2pg7U48Ospsot8,6096
154
154
  biopipen/scripts/scrna/SeuratSubset.R,sha256=yVA11NVE2FSSw-DhxQcJRapns0tNNHdyDYi5epO6SKM,1776
155
- biopipen/scripts/scrna/SeuratTo10X.R,sha256=T2nJBTwOe12AIKC2FZsMSv6xx3s-67CYZokpz5wshqY,2679
155
+ biopipen/scripts/scrna/SeuratTo10X.R,sha256=1mh1R0Qlo1iHVrpMLUXyLDOA92QKJ4GzTMURTFRqsWg,901
156
+ biopipen/scripts/scrna/Subset10X.R,sha256=T2nJBTwOe12AIKC2FZsMSv6xx3s-67CYZokpz5wshqY,2679
156
157
  biopipen/scripts/scrna/TopExpressingGenes.R,sha256=kXMCYHVytgVgO_Uq66fKKFCFV2PPXE8VREy_0yYPLpU,7475
157
- biopipen/scripts/scrna/Write10X.R,sha256=OMhXvJwvaH-aWsMpijKrvXQVabc1qUu5ZEwiLAhkDeY,285
158
158
  biopipen/scripts/scrna/celltypist-wrapper.py,sha256=f5M8f4rU5nC7l17RS0YVmUPpLLz4D6PIcgWtA77UExM,1722
159
159
  biopipen/scripts/scrna/sctype.R,sha256=NaUJkABwF5G1UVm1CCtcMbwLSj94Mo24mbYCKFqo1Bw,6524
160
160
  biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R,sha256=b77yG5FeRse3bNfFgLIEYGHNZzydAn1OeyyR_n5Ju60,4790
@@ -240,7 +240,7 @@ biopipen/utils/reference.py,sha256=6bPSwQa-GiDfr7xLR9a5T64Ey40y24yn3QfQ5wDFZkU,4
240
240
  biopipen/utils/rnaseq.R,sha256=Ro2B2dG-Z2oVaT5tkwp9RHBz4dp_RF-JcizlM5GYXFs,1298
241
241
  biopipen/utils/single_cell.R,sha256=pJjYP8bIZpNAtTQ32rOXhZxaM1Y-6D-xUcK3pql9tbk,4316
242
242
  biopipen/utils/vcf.py,sha256=ajXs0M_QghEctlvUlSRjWQIABVF02wPdYd-0LP4mIsU,9377
243
- biopipen-0.27.4.dist-info/METADATA,sha256=jBHr-0G03oeihg4W1XgeY5gVb4rI-4chXNOt6wWhbJE,882
244
- biopipen-0.27.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
245
- biopipen-0.27.4.dist-info/entry_points.txt,sha256=wu70aoBcv1UahVbB_5237MY-9M9_mzqmWjDD-oi3yz0,621
246
- biopipen-0.27.4.dist-info/RECORD,,
243
+ biopipen-0.27.5.dist-info/METADATA,sha256=V-P-6i9I4Q1OE-KDY39Nkki_Iv_5jpP-65kxeUuCc88,882
244
+ biopipen-0.27.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
245
+ biopipen-0.27.5.dist-info/entry_points.txt,sha256=wu70aoBcv1UahVbB_5237MY-9M9_mzqmWjDD-oi3yz0,621
246
+ biopipen-0.27.5.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- library(DropletUtils)
2
- library(Seurat)
3
-
4
- srtobjfile = {{in.srtobj | r}}
5
- outdir = {{out.outdir | r}}
6
- version = {{envs.version | r}}
7
-
8
- srtobj = readRDS(srtobjfile)
9
- counts = GetAssayData(object = srtobj, layer = "counts")
10
-
11
- write10xCounts(outdir, counts, version = version, overwrite = TRUE)