biopipen 0.32.3__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +6 -0
- biopipen/core/filters.py +35 -23
- biopipen/core/testing.py +6 -1
- biopipen/ns/bam.py +39 -0
- biopipen/ns/cellranger.py +5 -0
- biopipen/ns/cellranger_pipeline.py +2 -2
- biopipen/ns/cnvkit_pipeline.py +4 -1
- biopipen/ns/delim.py +33 -27
- biopipen/ns/protein.py +99 -0
- biopipen/ns/scrna.py +411 -250
- biopipen/ns/snp.py +16 -3
- biopipen/ns/tcr.py +125 -1
- biopipen/ns/vcf.py +34 -0
- biopipen/ns/web.py +5 -1
- biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
- biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
- biopipen/reports/tcr/ClonalStats.svelte +15 -0
- biopipen/reports/utils/misc.liq +20 -7
- biopipen/scripts/bam/BamMerge.py +2 -2
- biopipen/scripts/bam/BamSampling.py +4 -4
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +3 -3
- biopipen/scripts/bam/CNVpytor.py +10 -10
- biopipen/scripts/bam/ControlFREEC.py +11 -11
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
- biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +20 -9
- biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
- biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
- biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/SampleInfo.R +85 -148
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +4 -4
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifScan.py +8 -8
- biopipen/scripts/scrna/CellCellCommunication.py +59 -22
- biopipen/scripts/scrna/MarkersFinder.R +273 -654
- biopipen/scripts/scrna/RadarPlots.R +73 -53
- biopipen/scripts/scrna/SCP-plot.R +15202 -0
- biopipen/scripts/scrna/ScVelo.py +0 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
- biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
- biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
- biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
- biopipen/scripts/scrna/SeuratPreparing.R +138 -81
- biopipen/scripts/scrna/SlingShot.R +71 -0
- biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
- biopipen/scripts/snp/Plink2GTMat.py +26 -11
- biopipen/scripts/snp/PlinkFilter.py +7 -7
- biopipen/scripts/snp/PlinkFromVcf.py +8 -5
- biopipen/scripts/snp/PlinkSimulation.py +4 -4
- biopipen/scripts/snp/PlinkUpdateName.py +4 -4
- biopipen/scripts/stats/ChowTest.R +48 -22
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/ClonalStats.R +484 -0
- biopipen/scripts/tcr/ScRepLoading.R +127 -0
- biopipen/scripts/tcr/TCRDock.py +10 -6
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
- biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +4 -4
- biopipen/scripts/vcf/BcftoolsView.py +5 -5
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +12 -3
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +3 -3
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
- biopipen/scripts/web/gcloud_common.py +1 -1
- biopipen/utils/gsea.R +75 -35
- biopipen/utils/misc.R +205 -7
- biopipen/utils/misc.py +17 -8
- biopipen/utils/reference.py +11 -11
- biopipen/utils/repr.R +146 -0
- biopipen/utils/vcf.py +1 -1
- {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/METADATA +8 -8
- {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/RECORD +114 -105
- {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
- biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
- biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
- biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
- {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,12 +1,9 @@
|
|
|
1
|
-
{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
|
|
2
|
-
{{ biopipen_dir | joinpaths: "utils", "caching.R" | source_r }}
|
|
3
|
-
|
|
4
1
|
library(Seurat)
|
|
5
2
|
library(future)
|
|
6
3
|
library(bracer)
|
|
7
|
-
library(ggplot2)
|
|
8
4
|
library(dplyr)
|
|
9
|
-
|
|
5
|
+
library(glue)
|
|
6
|
+
library(biopipen.utils)
|
|
10
7
|
|
|
11
8
|
metafile <- {{in.metafile | quote}}
|
|
12
9
|
rdsfile <- {{out.rdsfile | quote}}
|
|
@@ -14,10 +11,9 @@ joboutdir <- {{job.outdir | quote}}
|
|
|
14
11
|
envs <- {{envs | r: todot = "-", skip = 1}}
|
|
15
12
|
|
|
16
13
|
if (isTRUE(envs$cache)) { envs$cache <- joboutdir }
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
}
|
|
14
|
+
|
|
15
|
+
log <- get_logger()
|
|
16
|
+
reporter <- get_reporter()
|
|
21
17
|
|
|
22
18
|
set.seed(8525)
|
|
23
19
|
# 8TB
|
|
@@ -26,15 +22,15 @@ options(future.rng.onMisuse="ignore")
|
|
|
26
22
|
options(Seurat.object.assay.version = "v5")
|
|
27
23
|
plan(strategy = "multicore", workers = envs$ncores)
|
|
28
24
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
add_report(
|
|
25
|
+
reporter$add(
|
|
32
26
|
list(
|
|
33
27
|
kind = "descr",
|
|
34
28
|
name = "Filters applied",
|
|
35
29
|
content = paste0(
|
|
36
30
|
"<p>Cell filters: ", html_escape(envs$cell_qc), "</p>",
|
|
37
|
-
"<p>Gene filters:
|
|
31
|
+
"<p>Gene filters: </p>",
|
|
32
|
+
"<p>- Min Cells: ", envs$gene_qc$min_cells, "</p>",
|
|
33
|
+
"<p>- Excludes: ", html_escape(envs$gene_qc$excludes %||% "Not set"), "</p>"
|
|
38
34
|
)
|
|
39
35
|
),
|
|
40
36
|
h1 = "Filters and QC"
|
|
@@ -48,16 +44,6 @@ metadata <- read.table(
|
|
|
48
44
|
check.names = FALSE
|
|
49
45
|
)
|
|
50
46
|
|
|
51
|
-
cache_sig <- capture.output(str(metadata))
|
|
52
|
-
dig_sig <- digest::digest(cache_sig, algo = "md5")
|
|
53
|
-
dig_sig <- substr(dig_sig, 1, 8)
|
|
54
|
-
cache_dir <- NULL
|
|
55
|
-
if (is.character(envs$cache)) {
|
|
56
|
-
cache_dir <- file.path(envs$cache, paste0(dig_sig, ".seuratpreparing_cache"))
|
|
57
|
-
dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
|
|
58
|
-
writeLines(cache_sig, file.path(cache_dir, "signature.txt"))
|
|
59
|
-
}
|
|
60
|
-
|
|
61
47
|
meta_cols = colnames(metadata)
|
|
62
48
|
if (!"Sample" %in% meta_cols) {
|
|
63
49
|
stop("Error: Column `Sample` is not found in metafile.")
|
|
@@ -66,77 +52,148 @@ if (!"RNAData" %in% meta_cols) {
|
|
|
66
52
|
stop("Error: Column `RNAData` is not found in metafile.")
|
|
67
53
|
}
|
|
68
54
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
log_info("Plotting and reporting QC ...")
|
|
87
|
-
dim_df = report_cell_qc(nrow(sobj))
|
|
88
|
-
|
|
89
|
-
if (is.list(envs$gene_qc)) {
|
|
90
|
-
sobj <- run_gene_qc(sobj)
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
dim_df = rbind(
|
|
94
|
-
dim_df,
|
|
95
|
-
data.frame(
|
|
96
|
-
when = "After_Gene_QC",
|
|
97
|
-
nCells = ncol(sobj),
|
|
98
|
-
nGenes = nrow(sobj)
|
|
99
|
-
)
|
|
55
|
+
qcdir = file.path(joboutdir, "qc")
|
|
56
|
+
dir.create(qcdir, showWarnings = FALSE, recursive = TRUE)
|
|
57
|
+
|
|
58
|
+
sobj <- LoadSeuratAndPerformQC(
|
|
59
|
+
metadata,
|
|
60
|
+
per_sample_qc = envs$cell_qc_per_sample,
|
|
61
|
+
cell_qc = envs$cell_qc,
|
|
62
|
+
gene_qc = envs$gene_qc,
|
|
63
|
+
tmpdir = joboutdir,
|
|
64
|
+
log = log,
|
|
65
|
+
cache = envs$cache)
|
|
66
|
+
|
|
67
|
+
log$info("Saving dimension table ...")
|
|
68
|
+
dim_df <- data.frame(
|
|
69
|
+
when = c("Before QC", "After QC"),
|
|
70
|
+
nCells = c(nrow(sobj@misc$cell_qc_df), sum(sobj@misc$cell_qc_df$.QC)),
|
|
71
|
+
nGenes = c(sobj@misc$gene_qc$before, sobj@misc$gene_qc$after)
|
|
100
72
|
)
|
|
101
|
-
|
|
102
|
-
log_info("Saving dimension table ...")
|
|
103
|
-
write.table(dim_df, file = file.path(plotsdir, "dim.txt"),
|
|
73
|
+
write.table(dim_df, file = file.path(qcdir, "dim.txt"),
|
|
104
74
|
row.names = FALSE, quote = FALSE, sep = "\t")
|
|
105
75
|
|
|
106
|
-
|
|
76
|
+
reporter$add(
|
|
107
77
|
list(
|
|
108
78
|
kind = "descr",
|
|
109
|
-
content =
|
|
110
|
-
"The dimension table for the Seurat object. The table contains the number of cells and genes before and after QC."
|
|
111
|
-
)
|
|
79
|
+
content = "The dimension table for the Seurat object. The table contains the number of cells and genes before and after QC. Note that the cell QC is performed before gene QC."
|
|
112
80
|
),
|
|
113
81
|
list(
|
|
114
82
|
kind = "table",
|
|
115
|
-
data = list(path = file.path(
|
|
83
|
+
data = list(path = file.path(qcdir, "dim.txt"))
|
|
116
84
|
),
|
|
117
|
-
h1 = "Filters and QC"
|
|
85
|
+
h1 = "Filters and QC",
|
|
86
|
+
h2 = "Dimension table"
|
|
118
87
|
)
|
|
119
88
|
|
|
120
|
-
|
|
121
|
-
|
|
89
|
+
log$info("Visualizing QC metrics ...")
|
|
90
|
+
for (pname in names(envs$qc_plots)) {
|
|
91
|
+
args <- envs$qc_plots[[pname]]
|
|
92
|
+
args$kind <- args$kind %||% "cell"
|
|
93
|
+
args$devpars <- args$devpars %||% list()
|
|
94
|
+
args$more_formats <- args$more_formats %||% character()
|
|
95
|
+
args$save_code <- args$save_code %||% FALSE
|
|
96
|
+
extract_vars(args, "kind", "devpars", "more_formats", "save_code")
|
|
97
|
+
if (kind == "gene") kind <- "gene_qc"
|
|
98
|
+
if (kind == "cell") kind <- "cell_qc"
|
|
99
|
+
args$object <- sobj
|
|
100
|
+
plot_fn <- if (kind == "cell_qc") {
|
|
101
|
+
gglogger::register(VizSeuratCellQC)
|
|
102
|
+
} else {
|
|
103
|
+
gglogger::register(VizSeuratGeneQC)
|
|
104
|
+
}
|
|
105
|
+
p <- do_call(plot_fn, args)
|
|
106
|
+
prefix <- file.path(qcdir, paste0(slugify(pname), "_", kind))
|
|
107
|
+
save_plot(p, prefix, devpars, formats = c("png", more_formats))
|
|
108
|
+
if (save_code) {
|
|
109
|
+
save_plotcode(p, prefix,
|
|
110
|
+
setup = c("library(biopipen.utils)", "load('data.RData')", "invisible(list2env('args'))"),
|
|
111
|
+
"args",
|
|
112
|
+
auto_data_setup = FALSE)
|
|
113
|
+
}
|
|
114
|
+
reporter$add(
|
|
115
|
+
reporter$image(prefix, more_formats, save_code, kind = "image"),
|
|
116
|
+
h1 = "Filters and QC",
|
|
117
|
+
h2 = html_escape(pname)
|
|
118
|
+
)
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
sobj <- RunSeuratTransformation(
|
|
122
|
+
sobj,
|
|
123
|
+
use_sct = envs$use_sct,
|
|
124
|
+
SCTransformArgs = envs$SCTransform,
|
|
125
|
+
NormalizeDataArgs = envs$NormalizeData,
|
|
126
|
+
FindVariableFeaturesArgs = envs$FindVariableFeatures,
|
|
127
|
+
ScaleDataArgs = envs$ScaleData,
|
|
128
|
+
RunPCAArgs = envs$RunPCA,
|
|
129
|
+
log = log,
|
|
130
|
+
cache = envs$cache
|
|
131
|
+
)
|
|
132
|
+
sobj <- RunSeuratIntegration(
|
|
133
|
+
sobj,
|
|
134
|
+
no_integration = envs$no_integration,
|
|
135
|
+
IntegrateLayersArgs = envs$IntegrateLayers,
|
|
136
|
+
log = log,
|
|
137
|
+
cache = envs$cache
|
|
138
|
+
)
|
|
122
139
|
|
|
123
140
|
# This is the last step, doesn't need to be cached
|
|
124
|
-
if (!
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
141
|
+
if (!identical(envs$doublet_detector, "none")) {
|
|
142
|
+
dbldir <- file.path(joboutdir, "doublets")
|
|
143
|
+
dir.create(dbldir, showWarnings = FALSE, recursive = TRUE)
|
|
144
|
+
|
|
145
|
+
sobj <- RunSeuratDoubletDetection(
|
|
146
|
+
sobj,
|
|
147
|
+
tool = envs$doublet_detector,
|
|
148
|
+
DoubletFinderArgs = envs$DoubletFinder,
|
|
149
|
+
scDblFinderArgs = envs$scDblFinder,
|
|
150
|
+
filter = FALSE,
|
|
151
|
+
log = log,
|
|
152
|
+
cache = envs$cache
|
|
153
|
+
)
|
|
137
154
|
|
|
155
|
+
log$info("Visualizing doublet detection results ...")
|
|
156
|
+
if (identical(tolower(envs$doublet_detector), "doubletfinder")) {
|
|
157
|
+
p <- VizSeuratDoublets(sobj, plot_type = "pK", x_text_angle = 90)
|
|
158
|
+
save_plot(
|
|
159
|
+
p, file.path(dbldir, "doubletfinder_pk"),
|
|
160
|
+
devpars = list(res = 100, width = 800, height = 600),
|
|
161
|
+
formats = "png")
|
|
162
|
+
reporter$add(
|
|
163
|
+
list(
|
|
164
|
+
kind = "descr",
|
|
165
|
+
content = paste(
|
|
166
|
+
"The pK plot from DoubletFinder to select the optimal pK value.",
|
|
167
|
+
"See more at https://github.com/chris-mcginnis-ucsf/DoubletFinder"
|
|
168
|
+
)
|
|
169
|
+
),
|
|
170
|
+
list(
|
|
171
|
+
kind = "image",
|
|
172
|
+
src = file.path(dbldir, "doubletfinder_pk.png")
|
|
173
|
+
),
|
|
174
|
+
h1 = glue("Doublet detection using {envs$doublet_detector}"),
|
|
175
|
+
h2 = "BC metric vs pK"
|
|
176
|
+
)
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
for (pt in c("dim", "pie")) {
|
|
180
|
+
p <- VizSeuratDoublets(sobj, plot_type = pt)
|
|
181
|
+
save_plot(p, file.path(dbldir, paste0("doublets_", pt)), formats = "png")
|
|
182
|
+
|
|
183
|
+
reporter$add(
|
|
184
|
+
list(
|
|
185
|
+
src = file.path(dbldir, paste0("doublets_", pt, ".png")),
|
|
186
|
+
descr = ifelse(pt == "dim", "Dimention Reduction Plot", "Pie Chart")
|
|
187
|
+
),
|
|
188
|
+
h1 = glue("Doublet detection using {envs$doublet_detector}"),
|
|
189
|
+
h2 = "Doublets distribution",
|
|
190
|
+
ui = "table_of_images"
|
|
191
|
+
)
|
|
192
|
+
}
|
|
138
193
|
|
|
139
|
-
|
|
140
|
-
|
|
194
|
+
sobj <- subset(sobj, subset = !!sym(paste0(sobj@misc$doublets$tool, "_DropletType")) != "doublet")
|
|
195
|
+
}
|
|
141
196
|
|
|
142
|
-
|
|
197
|
+
log$info("Saving QC'ed seurat object ...")
|
|
198
|
+
reporter$save(joboutdir)
|
|
199
|
+
saveRDS(sobj, rdsfile)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
|
|
2
|
+
|
|
3
|
+
library(rlang)
|
|
4
|
+
library(Seurat)
|
|
5
|
+
library(slingshot)
|
|
6
|
+
|
|
7
|
+
sobjfile <- {{in.sobjfile | r}}
|
|
8
|
+
outfile <- {{out.outfile | r}}
|
|
9
|
+
group_by <- {{envs.group_by | r}}
|
|
10
|
+
reduction <- {{envs.reduction | r}}
|
|
11
|
+
dims <- {{envs.dims | r}}
|
|
12
|
+
start <- {{envs.start | r}}
|
|
13
|
+
end <- {{envs.end | r}}
|
|
14
|
+
prefix <- {{envs.prefix | r}}
|
|
15
|
+
reverse <- {{envs.reverse | r}}
|
|
16
|
+
align_start <- {{envs.align_start | r}}
|
|
17
|
+
seed <- {{envs.seed | r}}
|
|
18
|
+
|
|
19
|
+
set.seed(seed)
|
|
20
|
+
if (is.null(group_by)) {
|
|
21
|
+
stop("envs.group_by is required")
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
log_info("Reading Seurat object ...")
|
|
25
|
+
srt <- readRDS(sobjfile)
|
|
26
|
+
|
|
27
|
+
if (!group_by %in% colnames(srt@meta.data)) {
|
|
28
|
+
stop(paste("Grouping column", group_by, "not found in the Seurat object"))
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
reduction <- reduction %||% DefaultDimReduc(srt)
|
|
32
|
+
dims <- expand_dims(dims)
|
|
33
|
+
|
|
34
|
+
if (is.null(prefix)) {
|
|
35
|
+
prefix <- ""
|
|
36
|
+
} else {
|
|
37
|
+
prefix <- paste0(prefix, "_")
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
log_info("Filtering cells in NA group_by ...")
|
|
41
|
+
srt_sub <- srt[, !is.na(srt[[group_by, drop = TRUE]])]
|
|
42
|
+
|
|
43
|
+
log_info("Running Slingshot ...")
|
|
44
|
+
sl <- slingshot(
|
|
45
|
+
data = as.data.frame(srt_sub[[reduction]]@cell.embeddings[, dims]),
|
|
46
|
+
clusterLabels = as.character(srt_sub[[group_by, drop = TRUE]]),
|
|
47
|
+
start.clus = start, end.clus = end
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
command <- pbmc_small@commands[[1]]
|
|
51
|
+
attr(command, "name") <- "SlingShot"
|
|
52
|
+
attr(command, "call.string") <- "slingshot(...)"
|
|
53
|
+
attr(command, "params") <- list()
|
|
54
|
+
srt@commands <- srt@commands %||% list()
|
|
55
|
+
srt@commands$Slingshot <- command
|
|
56
|
+
|
|
57
|
+
df <- as.data.frame(slingPseudotime(sl))
|
|
58
|
+
colnames(df) <- paste0(prefix, colnames(df))
|
|
59
|
+
if (isTRUE(reverse)) {
|
|
60
|
+
if (isTRUE(align_start)) {
|
|
61
|
+
df <- apply(df, 2, function(x) max(x, na.rm = TRUE) - x)
|
|
62
|
+
} else {
|
|
63
|
+
df <- max(df, na.rm = TRUE) - df
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
srt <- AddMetaData(srt, metadata = df)
|
|
68
|
+
srt <- AddMetaData(srt, metadata = slingBranchID(sl), col.name = paste0(prefix, "BranchID"))
|
|
69
|
+
|
|
70
|
+
log_info("Saving Seurat object ...")
|
|
71
|
+
saveRDS(srt, outfile)
|
|
@@ -7,14 +7,13 @@ parser.add_argument(
|
|
|
7
7
|
parser.add_argument("-o", "--output", required=True, help="Output file")
|
|
8
8
|
parser.add_argument("-m", "--model", required=True, help="Model file")
|
|
9
9
|
parser.add_argument(
|
|
10
|
-
"-v", "--majority_voting",
|
|
11
|
-
action="store_true",
|
|
12
|
-
help="Majority voting"
|
|
10
|
+
"-v", "--majority_voting", action="store_true", help="Majority voting"
|
|
13
11
|
)
|
|
14
12
|
parser.add_argument(
|
|
15
|
-
"-c",
|
|
13
|
+
"-c",
|
|
14
|
+
"--over_clustering",
|
|
16
15
|
default="seurat_clusters",
|
|
17
|
-
help="Over clustering. Ignored if the column does not exist."
|
|
16
|
+
help="Over clustering. Ignored if the column does not exist.",
|
|
18
17
|
)
|
|
19
18
|
|
|
20
19
|
|
|
@@ -44,7 +43,9 @@ if __name__ == "__main__":
|
|
|
44
43
|
|
|
45
44
|
if args.output.endswith(".h5ad"):
|
|
46
45
|
try:
|
|
47
|
-
out_adata._raw._var.rename(
|
|
46
|
+
out_adata._raw._var.rename( # type: ignore
|
|
47
|
+
columns={"_index": "features"}, inplace=True
|
|
48
|
+
)
|
|
48
49
|
del out_adata.raw
|
|
49
50
|
except (KeyError, AttributeError):
|
|
50
51
|
pass
|
|
@@ -3,15 +3,16 @@ from os import path
|
|
|
3
3
|
from glob import glob
|
|
4
4
|
from biopipen.utils.misc import run_command, logger
|
|
5
5
|
|
|
6
|
-
indir = {{in.indir |
|
|
7
|
-
outfile = {{out.outfile |
|
|
8
|
-
plink = {{envs.plink |
|
|
9
|
-
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
10
|
-
transpose = {{envs.transpose | repr}} # pyright: ignore
|
|
11
|
-
samid = {{envs.samid | repr}} # pyright: ignore
|
|
12
|
-
varid = {{envs.varid | repr}} # pyright: ignore
|
|
13
|
-
trans_chr = {{envs.trans_chr | repr}} # pyright: ignore
|
|
14
|
-
missing_id = {{envs.missing_id | repr}} # pyright: ignore
|
|
6
|
+
indir: str = {{in.indir | quote}} # noqa: E999 # pyright: ignore
|
|
7
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
8
|
+
plink: str = {{envs.plink | quote}} # pyright: ignore
|
|
9
|
+
ncores: int = {{envs.ncores | repr}} # pyright: ignore
|
|
10
|
+
transpose: bool = {{envs.transpose | repr}} # pyright: ignore
|
|
11
|
+
samid: str = {{envs.samid | repr}} # pyright: ignore
|
|
12
|
+
varid: str = {{envs.varid | repr}} # pyright: ignore
|
|
13
|
+
trans_chr: dict = {{envs.trans_chr | repr}} # pyright: ignore
|
|
14
|
+
missing_id: str = {{envs.missing_id | repr}} # pyright: ignore
|
|
15
|
+
gtcoding: str = {{envs.gtcoding | repr}} # pyright: ignore
|
|
15
16
|
trans_chr = trans_chr or {}
|
|
16
17
|
|
|
17
18
|
bedfile = glob(path.join(indir, '*.bed'))
|
|
@@ -37,6 +38,14 @@ cmd = [
|
|
|
37
38
|
|
|
38
39
|
run_command(cmd, fg=True, env={"cwd": path.dirname(outfile)})
|
|
39
40
|
|
|
41
|
+
|
|
42
|
+
def _vcf_gtcoding(gt):
|
|
43
|
+
try:
|
|
44
|
+
return str(2 - int(gt))
|
|
45
|
+
except (ValueError, TypeError):
|
|
46
|
+
return "NA"
|
|
47
|
+
|
|
48
|
+
|
|
40
49
|
if not transpose: # rows are variants, columns are samples
|
|
41
50
|
# .traw file is created, tab-separated, with the following columns:
|
|
42
51
|
trawfile = output + ".traw"
|
|
@@ -82,7 +91,10 @@ if not transpose: # rows are variants, columns are samples
|
|
|
82
91
|
.replace('{ref}', ref)
|
|
83
92
|
.replace('{alt}', alt)
|
|
84
93
|
)
|
|
85
|
-
|
|
94
|
+
if gtcoding == "plink":
|
|
95
|
+
record = [variant] + line[6:]
|
|
96
|
+
else: # vcf
|
|
97
|
+
record = [variant] + [_vcf_gtcoding(x) for x in line[6:]]
|
|
86
98
|
fout.write('\t'.join(record) + '\n')
|
|
87
99
|
|
|
88
100
|
else:
|
|
@@ -129,5 +141,8 @@ else:
|
|
|
129
141
|
fid = line[0]
|
|
130
142
|
iid = line[1]
|
|
131
143
|
sam = samid.replace('{fid}', fid).replace('{iid}', iid)
|
|
132
|
-
|
|
144
|
+
if gtcoding == "plink":
|
|
145
|
+
record = [sam] + line[6:]
|
|
146
|
+
else: # vcf
|
|
147
|
+
record = [sam] + [_vcf_gtcoding(x) for x in line[6:]]
|
|
133
148
|
fout.write('\t'.join(record) + '\n')
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
|
|
1
|
+
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
5
5
|
|
|
6
|
-
indir = {{in.indir |
|
|
7
|
-
samples_file = {{in.samples_file |
|
|
8
|
-
variants_file = {{in.variants_file |
|
|
9
|
-
outdir = {{out.outdir |
|
|
6
|
+
indir: str = {{in.indir | quote}} # pyright: ignore # noqa: #999
|
|
7
|
+
samples_file = {{in.samples_file | quote}} # pyright: ignore
|
|
8
|
+
variants_file = {{in.variants_file | quote}} # pyright: ignore
|
|
9
|
+
outdir: str = {{out.outdir | quote}} # pyright: ignore
|
|
10
10
|
|
|
11
11
|
plink = {{envs.plink | repr}} # pyright: ignore
|
|
12
12
|
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
13
|
-
samples = {{envs.samples | repr}} # pyright: ignore
|
|
14
|
-
variants = {{envs.variants | repr}} # pyright: ignore
|
|
13
|
+
samples: list[str] | str = {{envs.samples | repr}} # pyright: ignore
|
|
14
|
+
variants: list[str] | str = {{envs.variants | repr}} # pyright: ignore
|
|
15
15
|
e_samples_file = {{envs.samples_file | repr}} # pyright: ignore
|
|
16
16
|
e_variants_file = {{envs.variants_file | repr}} # pyright: ignore
|
|
17
17
|
keep = {{envs.keep | repr}} # pyright: ignore
|
|
@@ -1,12 +1,14 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from os import path, PathLike
|
|
2
4
|
from biopipen.core.filters import dict_to_cli_args
|
|
3
5
|
from biopipen.utils.reference import tabix_index
|
|
4
6
|
from biopipen.utils.misc import run_command
|
|
5
7
|
|
|
6
|
-
invcf = {{in.invcf |
|
|
7
|
-
outprefix = {{in.invcf | stem0 |
|
|
8
|
-
outdir = {{out.outdir |
|
|
9
|
-
args = {{envs | dict
|
|
8
|
+
invcf: str | PathLike = {{in.invcf | quote}} # noqa: E999 # pyright: ignore
|
|
9
|
+
outprefix: str = {{in.invcf | stem0 | quote}} # pyright: ignore
|
|
10
|
+
outdir: str = {{out.outdir | quote}} # pyright: ignore
|
|
11
|
+
args: dict = {{envs | dict}} # pyright: ignore
|
|
10
12
|
|
|
11
13
|
plink = args.pop("plink")
|
|
12
14
|
tabix = args.pop("tabix")
|
|
@@ -23,6 +25,7 @@ args.setdefault("max_alleles", 2)
|
|
|
23
25
|
# This makes it possible to keep the allele order in the output
|
|
24
26
|
# no need for plink2
|
|
25
27
|
# args["keep_allele_order"] = True
|
|
28
|
+
args.setdefault("keep_allele_order", True)
|
|
26
29
|
|
|
27
30
|
# resolve plink 1.x --set-missing-var-ids doesn't distinguish $1, $2,...
|
|
28
31
|
# for ref and alts
|
|
@@ -4,9 +4,9 @@ from slugify import slugify
|
|
|
4
4
|
from simpleconf import Config
|
|
5
5
|
from biopipen.utils.misc import logger, run_command, dict_to_cli_args
|
|
6
6
|
|
|
7
|
-
configfile = {{in.configfile |
|
|
8
|
-
outdir = {{out.outdir |
|
|
9
|
-
gtmatfile = {{out.gtmat |
|
|
7
|
+
configfile: str = {{in.configfile | quote}} # pyright: ignore # noqa: E999
|
|
8
|
+
outdir: str = {{out.outdir | quote}} # pyright: ignore
|
|
9
|
+
gtmatfile: str = {{out.gtmat | quote}} # pyright: ignore
|
|
10
10
|
config = Config.load(configfile)
|
|
11
11
|
|
|
12
12
|
default_nsnps = {{envs.nsnps | repr}} # pyright: ignore
|
|
@@ -21,7 +21,7 @@ default_maxfreq = {{envs.maxfreq | repr}} # pyright: ignore
|
|
|
21
21
|
default_hetodds = {{envs.hetodds | repr}} # pyright: ignore
|
|
22
22
|
default_homodds = {{envs.homodds | repr}} # pyright: ignore
|
|
23
23
|
default_missing = {{envs.missing | repr}} # pyright: ignore
|
|
24
|
-
default_args = {{envs.args | repr}} # pyright: ignore
|
|
24
|
+
default_args: dict = {{envs.args | repr}} # pyright: ignore
|
|
25
25
|
default_transpose_gtmat = {{envs.transpose_gtmat | repr}} # pyright: ignore
|
|
26
26
|
default_sample_prefix = {{envs.sample_prefix | repr}} # pyright: ignore
|
|
27
27
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
3
3
|
|
|
4
|
-
indir = {{in.indir |
|
|
5
|
-
namefile = {{in.namefile |
|
|
6
|
-
outdir = {{out.outdir |
|
|
4
|
+
indir: str = {{in.indir | quote}} # pyright: ignore # noqa: #999
|
|
5
|
+
namefile: str = {{in.namefile | quote}} # pyright: ignore
|
|
6
|
+
outdir: str = {{out.outdir | quote}} # pyright: ignore
|
|
7
7
|
plink = {{envs.plink | repr}} # pyright: ignore
|
|
8
8
|
bcftools = {{envs.bcftools | repr}} # pyright: ignore
|
|
9
9
|
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
@@ -111,7 +111,7 @@ if namefile.endswith(".vcf") or namefile.endswith(".vcf.gz"):
|
|
|
111
111
|
else:
|
|
112
112
|
info = readline(finfo)
|
|
113
113
|
|
|
114
|
-
namefile = namefile_tmp
|
|
114
|
+
namefile = str(namefile_tmp)
|
|
115
115
|
|
|
116
116
|
args = {
|
|
117
117
|
"": plink,
|
|
@@ -12,15 +12,17 @@ transpose_input <- {{envs.transpose_input | r}}
|
|
|
12
12
|
transpose_group <- {{envs.transpose_group | r}}
|
|
13
13
|
|
|
14
14
|
log_info("Reading input files ...")
|
|
15
|
-
indata <- read.table(infile, header = TRUE, sep = "\t", row.names = 1)
|
|
15
|
+
indata <- read.table(infile, header = TRUE, sep = "\t", row.names = 1, check.names = FALSE)
|
|
16
16
|
if (transpose_input) {
|
|
17
17
|
indata <- t(indata)
|
|
18
18
|
}
|
|
19
|
-
groupdata <- read.table(groupfile, header = TRUE, sep = "\t", row.names = 1)
|
|
19
|
+
groupdata <- read.table(groupfile, header = TRUE, sep = "\t", row.names = 1, check.names = FALSE)
|
|
20
20
|
if (transpose_group) {
|
|
21
21
|
groupdata <- t(groupdata)
|
|
22
22
|
}
|
|
23
|
-
|
|
23
|
+
allgroups = na.omit(unique(unlist(groupdata)))
|
|
24
|
+
|
|
25
|
+
fmldata <- read.table(fmlfile, header = TRUE, sep = "\t", row.names = NULL, check.names = FALSE)
|
|
24
26
|
colnames(fmldata)[1:2] <- c("Group", "Formula")
|
|
25
27
|
|
|
26
28
|
chow.test <- function(fml, grouping) {
|
|
@@ -63,26 +65,43 @@ chow.test <- function(fml, grouping) {
|
|
|
63
65
|
)
|
|
64
66
|
}
|
|
65
67
|
|
|
66
|
-
formatlm <- function(m) {
|
|
67
|
-
if (
|
|
68
|
-
coeff <- as.list(m$coefficients)
|
|
68
|
+
formatlm <- function(m, g = NULL, type = "coeff") {
|
|
69
|
+
if (is.null(g)) {
|
|
69
70
|
vars <- all.vars(m$terms)
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
71
|
+
if (type == "pval") {
|
|
72
|
+
df <- as.data.frame(summary(m)$coefficients)
|
|
73
|
+
terms <- unlist(sapply(na.omit(c(vars[2:length(vars)], '(Intercept)', 'N')), function(x) {
|
|
74
|
+
pv <- df[x, 4] %||% df[bQuote(x), 4]
|
|
75
|
+
if (x == 'N') {
|
|
76
|
+
paste0('N=', nrow(m$model))
|
|
77
|
+
} else if (is.null(pv)) {
|
|
78
|
+
NULL
|
|
79
|
+
} else {
|
|
80
|
+
l <- ifelse(x == '(Intercept)', '_', x)
|
|
81
|
+
paste0(l, '=', signif(pv, digits = 4))
|
|
82
|
+
}
|
|
83
|
+
}))
|
|
84
|
+
} else {
|
|
85
|
+
coeff <- as.list(m$coefficients)
|
|
86
|
+
terms <- unlist(sapply(na.omit(c(vars[2:length(vars)], '(Intercept)', 'N')), function(x) {
|
|
87
|
+
ce <- coeff[[x]] %||% coeff[[bQuote(x)]]
|
|
88
|
+
if (x == 'N') {
|
|
89
|
+
paste0('N=', nrow(m$model))
|
|
90
|
+
} else if (is.null(ce)) {
|
|
91
|
+
NULL
|
|
92
|
+
} else {
|
|
93
|
+
l <- ifelse(x == '(Intercept)', '_', x)
|
|
94
|
+
paste0(l, '=', round(ce, 3))
|
|
95
|
+
}
|
|
96
|
+
}))
|
|
97
|
+
}
|
|
81
98
|
paste(terms[!is.null(terms)], collapse = ', ')
|
|
82
99
|
} else {
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
100
|
+
gm <- m[[as.character(g)]]
|
|
101
|
+
if (is.null(gm)) {
|
|
102
|
+
return(NA)
|
|
103
|
+
}
|
|
104
|
+
formatlm(gm, type = type)
|
|
86
105
|
}
|
|
87
106
|
}
|
|
88
107
|
|
|
@@ -98,8 +117,15 @@ results <- do_call(rbind, lapply(
|
|
|
98
117
|
log_debug(" Running Chow test for formula: {fmlrow$Formula} (grouping = {fmlrow$Group})")
|
|
99
118
|
|
|
100
119
|
res <- chow.test(fmlrow$Formula, fmlrow$Group)
|
|
101
|
-
fmlrow$
|
|
102
|
-
|
|
120
|
+
fmlrow$Pooled_Coef <- formatlm(res$pooled.lm)
|
|
121
|
+
for (g in allgroups) {
|
|
122
|
+
fmlrow[[paste0("Group_", g, "_Coef")]] <- formatlm(res$group.lms, g)
|
|
123
|
+
}
|
|
124
|
+
# fmlrow$Groups <- formatlm(res$group.lms)
|
|
125
|
+
fmlrow$Pooled_Pval <- formatlm(res$pooled.lm, type="pval")
|
|
126
|
+
for (g in allgroups) {
|
|
127
|
+
fmlrow[[paste0("Group_", g, "_Pval")]] <- formatlm(res$group.lms, g, type="pval")
|
|
128
|
+
}
|
|
103
129
|
fmlrow$SSR <- res$group.ssr
|
|
104
130
|
fmlrow$SumSSR <- res$pooled.ssr
|
|
105
131
|
fmlrow$Fstat <- res$Fstat
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
2
2
|
|
|
3
|
-
infile = {{in.infile | quote}} # pyright: ignore
|
|
3
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa
|
|
4
4
|
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
5
5
|
outdir = {{out.outdir | quote}} # pyright: ignore
|
|
6
6
|
perl = {{envs.perl | quote}} # pyright: ignore
|
|
7
7
|
ref = {{envs.ref | repr}} # pyright: ignore
|
|
8
8
|
samtools = {{envs.samtools | quote}} # pyright: ignore
|
|
9
|
-
args = {{envs.args |
|
|
9
|
+
args: dict = {{envs.args | dict}} # pyright: ignore
|
|
10
10
|
maf2vcf = {{biopipen_dir | append: "/scripts/tcgamaf/maf2vcf.pl" | repr}} # pyright: ignore
|
|
11
11
|
|
|
12
12
|
args['input-maf'] = infile
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
|
|
2
|
-
infile = {{in.infile | quote}} # pyright: ignore
|
|
3
|
-
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
2
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa
|
|
3
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
4
4
|
|
|
5
5
|
with open(infile) as fin, open(outfile, "w") as fout:
|
|
6
6
|
for line in fin:
|