biopipen 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +6 -0
- biopipen/core/filters.py +77 -26
- biopipen/core/testing.py +6 -1
- biopipen/ns/bam.py +39 -0
- biopipen/ns/cellranger.py +5 -0
- biopipen/ns/cellranger_pipeline.py +2 -2
- biopipen/ns/cnvkit_pipeline.py +4 -1
- biopipen/ns/delim.py +33 -27
- biopipen/ns/protein.py +99 -0
- biopipen/ns/scrna.py +411 -250
- biopipen/ns/snp.py +16 -3
- biopipen/ns/tcr.py +125 -1
- biopipen/ns/vcf.py +34 -0
- biopipen/ns/web.py +5 -1
- biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
- biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
- biopipen/reports/tcr/ClonalStats.svelte +15 -0
- biopipen/reports/utils/misc.liq +22 -7
- biopipen/scripts/bam/BamMerge.py +2 -2
- biopipen/scripts/bam/BamSampling.py +4 -4
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +3 -3
- biopipen/scripts/bam/CNVpytor.py +10 -10
- biopipen/scripts/bam/ControlFREEC.py +11 -11
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
- biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +20 -9
- biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
- biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
- biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/SampleInfo.R +85 -139
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +4 -4
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifScan.py +8 -8
- biopipen/scripts/scrna/CellCellCommunication.py +59 -22
- biopipen/scripts/scrna/CellsDistribution.R +31 -6
- biopipen/scripts/scrna/MarkersFinder.R +272 -602
- biopipen/scripts/scrna/MetaMarkers.R +16 -7
- biopipen/scripts/scrna/RadarPlots.R +75 -35
- biopipen/scripts/scrna/SCP-plot.R +15202 -0
- biopipen/scripts/scrna/ScVelo.py +0 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -25
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -47
- biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -385
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +33 -13
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -228
- biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
- biopipen/scripts/scrna/SeuratMap2Ref.R +16 -6
- biopipen/scripts/scrna/SeuratPreparing.R +138 -81
- biopipen/scripts/scrna/SlingShot.R +71 -0
- biopipen/scripts/scrna/TopExpressingGenes.R +9 -7
- biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
- biopipen/scripts/snp/Plink2GTMat.py +26 -11
- biopipen/scripts/snp/PlinkFilter.py +7 -7
- biopipen/scripts/snp/PlinkFromVcf.py +8 -5
- biopipen/scripts/snp/PlinkSimulation.py +4 -4
- biopipen/scripts/snp/PlinkUpdateName.py +4 -4
- biopipen/scripts/stats/ChowTest.R +48 -22
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/CDR3AAPhyschem.R +12 -2
- biopipen/scripts/tcr/ClonalStats.R +484 -0
- biopipen/scripts/tcr/CloneResidency.R +23 -5
- biopipen/scripts/tcr/Immunarch-basic.R +8 -1
- biopipen/scripts/tcr/Immunarch-clonality.R +5 -0
- biopipen/scripts/tcr/Immunarch-diversity.R +25 -4
- biopipen/scripts/tcr/Immunarch-geneusage.R +15 -1
- biopipen/scripts/tcr/Immunarch-kmer.R +14 -1
- biopipen/scripts/tcr/Immunarch-overlap.R +15 -1
- biopipen/scripts/tcr/Immunarch-spectratyping.R +10 -1
- biopipen/scripts/tcr/Immunarch-tracking.R +6 -0
- biopipen/scripts/tcr/Immunarch-vjjunc.R +33 -0
- biopipen/scripts/tcr/ScRepLoading.R +127 -0
- biopipen/scripts/tcr/TCRClusterStats.R +24 -7
- biopipen/scripts/tcr/TCRDock.py +10 -6
- biopipen/scripts/tcr/TESSA.R +6 -1
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
- biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +4 -4
- biopipen/scripts/vcf/BcftoolsView.py +5 -5
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +12 -3
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +3 -3
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
- biopipen/scripts/web/gcloud_common.py +1 -1
- biopipen/utils/gsea.R +96 -42
- biopipen/utils/misc.R +205 -7
- biopipen/utils/misc.py +17 -8
- biopipen/utils/plot.R +53 -17
- biopipen/utils/reference.py +11 -11
- biopipen/utils/repr.R +146 -0
- biopipen/utils/vcf.py +1 -1
- {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/METADATA +9 -9
- {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/RECORD +131 -122
- {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
- biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -139
- biopipen/scripts/scrna/SeuratPreparing-common.R +0 -452
- biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -201
- {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,14 +1,11 @@
|
|
|
1
|
-
{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
|
|
2
|
-
{{ biopipen_dir | joinpaths: "utils", "mutate_helpers.R" | source_r }}
|
|
3
|
-
|
|
4
1
|
library(rlang)
|
|
5
2
|
library(dplyr)
|
|
6
|
-
library(
|
|
7
|
-
library(
|
|
8
|
-
library(ggrepel)
|
|
3
|
+
library(biopipen.utils)
|
|
4
|
+
library(plotthis)
|
|
9
5
|
|
|
10
6
|
infile <- {{in.infile | r}}
|
|
11
7
|
outfile <- {{out.outfile | r}}
|
|
8
|
+
joboutdir <- {{job.outdir | r}}
|
|
12
9
|
sep <- {{envs.sep | r}}
|
|
13
10
|
mutaters <- {{envs.mutaters | r}}
|
|
14
11
|
save_mutated <- {{envs.save_mutated | r}}
|
|
@@ -16,6 +13,9 @@ defaults <- {{envs.defaults | r}}
|
|
|
16
13
|
stats <- {{envs.stats | r}}
|
|
17
14
|
exclude_cols <- {{envs.exclude_cols | r}}
|
|
18
15
|
|
|
16
|
+
log <- get_logger()
|
|
17
|
+
reporter <- get_reporter()
|
|
18
|
+
|
|
19
19
|
if (is.null(exclude_cols)) {
|
|
20
20
|
exclude_cols <- c()
|
|
21
21
|
} else {
|
|
@@ -29,6 +29,50 @@ if (colnames(indata)[1] == "row.names") {
|
|
|
29
29
|
stop("Wrong number of column names. Do you have the right `sep`?")
|
|
30
30
|
}
|
|
31
31
|
|
|
32
|
+
#' Resolve a plot type to the matching plotthis plotting function
#'
#' A fixed set of aliases (e.g. "hist", "pie", "venn") maps to specific
#' plotthis function names. Any other plot type is title-cased and suffixed
#' with "Plot" (e.g. "bar" -> "BarPlot").
#'
#' @param plot_type The plot type
#' @param gglogger_register Whether to pass the function through
#'   `gglogger::register()` before returning it
#' @param return_name Return the resolved function name instead of the function
#' @return The function name when `return_name` is TRUE; otherwise the
#'   plotthis function (registered with gglogger if `gglogger_register` is TRUE)
#' @export
get_plotthis_fn <- function(plot_type, gglogger_register = TRUE, return_name = FALSE) {
    # Empty alternatives fall through to the next non-empty one, so all
    # aliases of a plot share a single target name.
    fn_name <- switch(plot_type,
        hist = ,
        histo = ,
        histogram = "Histogram",
        featuredim = "FeatureDimPlot",
        splitbar = "SplitBarPlot",
        enrichmap = "EnrichMap",
        enrichnet = ,
        enrichnetwork = "EnrichNetwork",
        gsea = "GSEAPlot",
        gseasummary = ,
        gseasum = "GSEASummaryPlot",
        heatmap = "Heatmap",
        network = "Network",
        pie = "PieChart",
        wordcloud = "WordCloudPlot",
        venn = "VennDiagram",
        paste0(tools::toTitleCase(plot_type), "Plot")
    )
    if (return_name) {
        return(fn_name)
    }

    # A name that plotthis does not provide is reported as an unknown plot type
    fn <- tryCatch(
        utils::getFromNamespace(fn_name, "plotthis"),
        error = function(e) stop("Unknown plot type: ", plot_type)
    )

    if (!gglogger_register) {
        return(fn)
    }
    gglogger::register(fn, fn_name)
}
|
|
74
|
+
|
|
75
|
+
log$info("Applying mutaters to the data if any ...")
|
|
32
76
|
if (!is.null(mutaters) && length(mutaters) > 0) {
|
|
33
77
|
mutdata <- indata %>%
|
|
34
78
|
mutate(!!!lapply(mutaters, parse_expr))
|
|
@@ -44,7 +88,9 @@ write.table(
|
|
|
44
88
|
col.names = TRUE,
|
|
45
89
|
quote = FALSE
|
|
46
90
|
)
|
|
47
|
-
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
reporter$add(
|
|
48
94
|
list(
|
|
49
95
|
kind = "descr",
|
|
50
96
|
content = "The samples used in the analysis. Each row is a sample, and columns are the meta information about the sample. This is literally the input sample information file, but the paths to the scRNA-seq and scTCR-seq data are hidden.",
|
|
@@ -59,144 +105,44 @@ add_report(
|
|
|
59
105
|
h1 = "Sample Information"
|
|
60
106
|
)
|
|
61
107
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
is_continuous <- FALSE
|
|
68
|
-
if (!is.null(stat$subset)) {
|
|
69
|
-
data <- mutdata %>% filter(!!parse_expr(stat$subset))
|
|
70
|
-
} else {
|
|
71
|
-
data <- mutdata
|
|
72
|
-
}
|
|
73
|
-
if (!is.null(stat$group) && !stat$na_group) {
|
|
74
|
-
data <- data %>% filter(!is.na(!!sym(stat$group)))
|
|
75
|
-
}
|
|
76
|
-
if (!is.null(stat$each) && !stat$na_each) {
|
|
77
|
-
data <- data %>% filter(!is.na(!!sym(stat$each)))
|
|
78
|
-
}
|
|
108
|
+
if (length(stats) > 0) {
|
|
109
|
+
cases <- expand_cases(stats, defaults)
|
|
110
|
+
for (name in names(cases)) {
|
|
111
|
+
log$info("- Statistic: {name}")
|
|
79
112
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
113
|
+
case <- cases[[name]]
|
|
114
|
+
info <- case_info(name, outdir, is_dir = FALSE, create = TRUE)
|
|
115
|
+
case <- extract_vars(case, "plot_type", "more_formats", "save_code", "section", "subset", "devpars", "descr")
|
|
83
116
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
}
|
|
117
|
+
plot_fn <- get_plotthis_fn(plot_type)
|
|
118
|
+
more_formats <- unique(c("png", more_formats))
|
|
87
119
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
count_on <- paste0("..count.", stat$on)
|
|
91
|
-
if (!is_continuous) {
|
|
92
|
-
if (!is.null(stat$each)) {
|
|
93
|
-
data <- data %>% add_count(!!group, !!sym(stat$each), name = count_on)
|
|
120
|
+
if (!is.null(subset)) {
|
|
121
|
+
case$data <- mutdata %>% dplyr::filter(!!parse_expr(subset))
|
|
94
122
|
} else {
|
|
95
|
-
data <-
|
|
123
|
+
case$data <- mutdata
|
|
96
124
|
}
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
if (is.null(stat$devpars)) {
|
|
100
|
-
stat$devpars <- list()
|
|
101
|
-
}
|
|
102
|
-
if (is.null(stat$devpars$width)) {
|
|
103
|
-
stat$devpars$width <- 800
|
|
104
|
-
}
|
|
105
|
-
if (is.null(stat$devpars$height)) {
|
|
106
|
-
stat$devpars$height <- 600
|
|
107
|
-
}
|
|
108
|
-
if (is.null(stat$devpars$res)) {
|
|
109
|
-
stat$devpars$res <- 100
|
|
110
|
-
}
|
|
111
125
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
xlab("")
|
|
123
|
-
} else if (stat$plot == "violin" ||
|
|
124
|
-
stat$plot == "violinplot" ||
|
|
125
|
-
stat$plot == "vlnplot") {
|
|
126
|
-
p <- ggplot(data, aes(x = !!group, y = !!sym(stat$on), fill=!!group)) +
|
|
127
|
-
geom_violin(position = "dodge") +
|
|
128
|
-
scale_fill_biopipen(alpha = .6) +
|
|
129
|
-
xlab("")
|
|
130
|
-
} else if (
|
|
131
|
-
(grepl("violin", stat$plot) || grepl("vln", stat$plot)) &&
|
|
132
|
-
grepl("box", stat$plot)
|
|
133
|
-
) {
|
|
134
|
-
p <- ggplot(data, aes(x = !!group, y = !!sym(stat$on), fill = !!group)) +
|
|
135
|
-
geom_violin(position = "dodge") +
|
|
136
|
-
geom_boxplot(width = 0.1, position = position_dodge(0.9), fill="white") +
|
|
137
|
-
scale_fill_biopipen(alpha = .6) +
|
|
138
|
-
xlab("")
|
|
139
|
-
} else if (stat$plot == "histogram" || stat$plot == "hist") {
|
|
140
|
-
p <- ggplot(data, aes(x = !!sym(stat$on), fill = !!group)) +
|
|
141
|
-
geom_histogram(bins = 10, position = "dodge", alpha = 0.8, color = "white") +
|
|
142
|
-
scale_fill_biopipen(alpha = .6)
|
|
143
|
-
} else if (stat$plot == "pie" || stat$plot == "piechart") {
|
|
144
|
-
if (is.null(stat$each)) {
|
|
145
|
-
data <- data %>% distinct(!!group, .keep_all = TRUE)
|
|
146
|
-
} else {
|
|
147
|
-
data <- data %>%
|
|
148
|
-
distinct(!!group, !!sym(stat$each), .keep_all = TRUE) %>%
|
|
149
|
-
mutate(!!group := factor(!!group, levels = unique(!!group))) %>%
|
|
150
|
-
group_by(!!sym(stat$each))
|
|
126
|
+
p <- do_call(gglogger::register(plot_fn, name = plot_type), case)
|
|
127
|
+
save_plot(p, info$prefix, devpars, formats = more_formats)
|
|
128
|
+
if (save_code) {
|
|
129
|
+
save_plotcode(
|
|
130
|
+
p,
|
|
131
|
+
setup = c('library(plotthis)', '', 'load("data.RData")', 'list2env(case, envir = .GlobalEnv)'),
|
|
132
|
+
prefix = info$caseprefix,
|
|
133
|
+
"case",
|
|
134
|
+
auto_data_setup = FALSE
|
|
135
|
+
)
|
|
151
136
|
}
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
color="#333333",
|
|
163
|
-
fill="#EEEEEE",
|
|
164
|
-
size=4
|
|
165
|
-
) +
|
|
166
|
-
scale_fill_biopipen(alpha = .6, name = group) +
|
|
167
|
-
ggtitle(paste0("# ", stat$on))
|
|
168
|
-
} else if (stat$plot == "bar" || stat$plot == "barplot") {
|
|
169
|
-
if (is.null(stat$each)) {
|
|
170
|
-
data <- data %>% distinct(!!group, .keep_all = TRUE)
|
|
171
|
-
} else {
|
|
172
|
-
data <- data %>% distinct(!!group, !!sym(stat$each), .keep_all = TRUE)
|
|
173
|
-
}
|
|
174
|
-
p <- ggplot(
|
|
175
|
-
data,
|
|
176
|
-
aes(x = !!group, y = !!sym(count_on), fill = !!group)) +
|
|
177
|
-
geom_bar(stat = "identity") +
|
|
178
|
-
scale_fill_biopipen(alpha = .6) +
|
|
179
|
-
ylab(paste0("# ", stat$on))
|
|
180
|
-
} else {
|
|
181
|
-
stop("Unknown plot type: ", stat$plot)
|
|
182
|
-
}
|
|
183
|
-
if (!is.null(stat$each)) {
|
|
184
|
-
p <- p + facet_wrap(vars(!!sym(stat$each)), ncol = stat$ncol)
|
|
137
|
+
|
|
138
|
+
reporter$add(
|
|
139
|
+
reporter$image(
|
|
140
|
+
info$prefix,
|
|
141
|
+
c("png", more_formats),
|
|
142
|
+
save_code,
|
|
143
|
+
kind = "table_image"
|
|
144
|
+
),
|
|
145
|
+
h1 = "Statistics", ui = "table_of_images:2"
|
|
146
|
+
)
|
|
185
147
|
}
|
|
186
|
-
print(p)
|
|
187
|
-
dev.off()
|
|
188
|
-
|
|
189
|
-
by_desc <- ifelse(is.null(stat$by), "", paste0(" by ", stat$by))
|
|
190
|
-
descr <- ifelse(
|
|
191
|
-
is_continuous,
|
|
192
|
-
paste0("The distribution of ", stat$on, by_desc),
|
|
193
|
-
paste0("The number of ", stat$on, by_desc)
|
|
194
|
-
)
|
|
195
|
-
add_report(
|
|
196
|
-
list(kind = "table_image", src = plotfile, name = name, descr = descr),
|
|
197
|
-
h1 = "Statistics",
|
|
198
|
-
ui = "table_of_images:2"
|
|
199
|
-
)
|
|
200
148
|
}
|
|
201
|
-
|
|
202
|
-
save_report(outdir)
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import rtoml
|
|
3
3
|
|
|
4
|
-
configstr = {{in.config |
|
|
5
|
-
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
4
|
+
configstr: str = {{in.config | quote}} # pyright: ignore # noqa
|
|
5
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
6
6
|
infmt = {{envs.infmt | quote}} # pyright: ignore
|
|
7
7
|
outfmt = {{envs.outfmt | quote}} # pyright: ignore
|
|
8
8
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
instr = {{in.str |
|
|
1
|
+
instr: str = {{in.str | quote}} # pyright: ignore # noqa
|
|
2
2
|
name = {{repr(in.name or envs.name)}} # pyright: ignore
|
|
3
|
-
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
3
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
4
4
|
|
|
5
5
|
with open(outfile, "wt") as fout:
|
|
6
6
|
fout.write(instr)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from shutil import which
|
|
3
|
+
from diot import Diot # noqa: F401
|
|
4
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
5
|
+
|
|
6
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa
|
|
7
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
8
|
+
envs: dict = {{envs | repr}} # pyright: ignore
|
|
9
|
+
tool: str = envs.pop("tool", "maxit")
|
|
10
|
+
maxit: str = envs.pop("maxit", "maxit")
|
|
11
|
+
beem = envs.pop("beem", "BeEM")
|
|
12
|
+
|
|
13
|
+
# Convert the mmCIF input to PDB with the selected tool. Any value of `tool`
# other than "maxit" is handled by the BeEM branch.
if tool != "maxit":
    out_path = Path(outfile)
    envs.update(
        {
            "_": infile,
            # Output path without its suffix (presumably BeEM's output
            # prefix -- confirm against the BeEM documentation).
            "p": out_path.parent.joinpath(out_path.stem),
            "outfmt": 3,
        }
    )
    run_command([beem, *dict_to_cli_args(envs, prefix="-", sep="=")], fg=True)
else:
    maxit_found = which(maxit)
    if not maxit_found:
        raise ValueError(f"maxit executable not found: {maxit}")

    # RCSBROOT is set to the directory two levels above the resolved
    # maxit executable.
    rcsbroot = Path(maxit_found).expanduser().resolve().parent.parent
    envs.update(
        {
            "input": infile,
            "output": outfile,
            "o": 2,
            "log": Path(outfile).with_suffix(".log"),
        }
    )
    run_command(
        [maxit, *dict_to_cli_args(envs, prefix="-")],
        fg=True,
        env={"RCSBROOT": rcsbroot},
    )
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# """
|
|
2
|
+
# LICENSE
|
|
3
|
+
|
|
4
|
+
# GNU General Public License v2.0
|
|
5
|
+
|
|
6
|
+
# The code is based on the script from:
|
|
7
|
+
# https://github.com/kad-ecoli/pdb2fasta/blob/master/pdb2fasta.py
|
|
8
|
+
|
|
9
|
+
# The original code is licensed under GNU General Public License v2.0.
|
|
10
|
+
# The original code is modified by biopipen developers to fit the biopipen.
|
|
11
|
+
# """
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
import re
|
|
14
|
+
from collections import defaultdict
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa: E999
|
|
18
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
19
|
+
chains: str | list | None = {{envs.chains | repr}} # pyright: ignore
|
|
20
|
+
wrap: int = {{envs.wrap | repr}} # pyright: ignore
|
|
21
|
+
|
|
22
|
+
if isinstance(chains, str):
|
|
23
|
+
chains = [chain.strip() for chain in chains.split(",")]
|
|
24
|
+
|
|
25
|
+
aa3to1 = {
|
|
26
|
+
'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M',
|
|
27
|
+
'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K',
|
|
28
|
+
'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H',
|
|
29
|
+
'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G',
|
|
30
|
+
'MSE':'M',
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
ca_pattern = re.compile(
|
|
34
|
+
r"^ATOM\s{2,6}\d{1,5}\s{2}CA\s[\sA]([A-Z]{3})\s([\s\w])|^HETATM\s{0,4}\d{1,5}\s{2}CA\s[\sA](MSE)\s([\s\w])" # noqa: W605
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
# The FASTA record names are "<input file stem>:<chain id>".
filename = Path(infile).stem
# chain id -> one-letter sequence, in order of first appearance
chain_dict = defaultdict(str)

with open(infile, "r") as fp:
    for line in fp:
        # Only the first model of a multi-model file is read.
        if line.startswith("ENDMDL"):
            break

        # The pattern is anchored at the start of the line, so there is at
        # most one match per record.
        matched = ca_pattern.match(line)
        if not matched:
            continue

        # Groups 1/2 belong to the ATOM alternative, 3/4 to the HETATM (MSE)
        # alternative; exactly one pair is populated.
        resn = matched.group(1) or matched.group(3)
        chain = matched.group(2) or matched.group(4)
        if chains is None or chain in chains:
            # The pattern accepts any three-letter residue name on ATOM
            # records; residues missing from the table (e.g. UNK) are written
            # as "X" instead of aborting the conversion with a KeyError.
            chain_dict[chain] += aa3to1.get(resn, "X")
|
|
51
|
+
|
|
52
|
+
# Write one FASTA record per chain. `wrap` is the number of residues per
# line; a value of 0 (or None) writes each sequence on a single line.
with open(outfile, 'w') as fp:
    for chain, sequence in chain_dict.items():
        fp.write(f">{filename}:{chain}\n")
        if wrap and wrap > 0:
            # Honor the configured width instead of a fixed 80 columns.
            for start in range(0, len(sequence), wrap):
                fp.write(sequence[start:start + wrap] + "\n")
        else:
            fp.write(sequence + "\n")
|
|
@@ -2,15 +2,15 @@ import json
|
|
|
2
2
|
import logging
|
|
3
3
|
import sys
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from prodigy_prot.predict_IC import (
|
|
5
|
+
from prodigy_prot.predict_IC import ( # type: ignore
|
|
6
6
|
Prodigy,
|
|
7
7
|
check_path,
|
|
8
8
|
parse_structure,
|
|
9
9
|
)
|
|
10
10
|
|
|
11
|
-
infile = {{in.infile |
|
|
12
|
-
outfile = {{out.outfile |
|
|
13
|
-
outdir = {{out.outdir |
|
|
11
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa
|
|
12
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
13
|
+
outdir: str = {{out.outdir | quote}} # pyright: ignore
|
|
14
14
|
distance_cutoff = {{envs.distance_cutoff | float}} # pyright: ignore
|
|
15
15
|
acc_threshold = {{envs.acc_threshold | float}} # pyright: ignore
|
|
16
16
|
temperature = {{envs.temperature | float}} # pyright: ignore
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from shutil import which
|
|
3
|
+
from diot import Diot # noqa: F401
|
|
4
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
5
|
+
|
|
6
|
+
infile1: str = {{in.infile1 | quote}} # pyright: ignore # noqa
|
|
7
|
+
infile2: str = {{in.infile2 | quote}} # pyright: ignore # noqa
|
|
8
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore # noqa
|
|
9
|
+
outdir: str = {{job.outdir | quote}} # pyright: ignore # noqa
|
|
10
|
+
envs: dict = {{envs | repr}} # pyright: ignore # noqa
|
|
11
|
+
conv_tool = envs.pop("conv_tool", "maxit")
|
|
12
|
+
maxit = envs.pop("maxit", "maxit")
|
|
13
|
+
beem = envs.pop("beem", "BeEM")
|
|
14
|
+
ca_only = envs.pop("ca_only", False)
|
|
15
|
+
# aa20_only = envs.pop("aa20_only", False)
|
|
16
|
+
duel = envs.pop("duel", "keep")
|
|
17
|
+
calculate_rmsd = envs.pop("calculate_rmsd", "calculate_rmsd")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cif_to_pdb(cif_file, pdb_file: Path):
    """Convert an mmCIF file to PDB with the configured converter.

    Uses maxit when the module-level ``conv_tool`` is ``"maxit"``, otherwise
    BeEM.

    Args:
        cif_file: Path to the input mmCIF file.
        pdb_file: Path of the PDB file to produce.

    Raises:
        ValueError: If ``conv_tool`` is ``"maxit"`` and the maxit executable
            cannot be found.
    """
    if conv_tool == "maxit":
        maxit_found = which(maxit)
        if not maxit_found:
            # which() returns None for a missing executable; fail with a
            # clear message rather than a TypeError from Path(None).
            raise ValueError(f"maxit executable not found: {maxit}")

        # RCSBROOT is set to the directory two levels above the resolved
        # maxit executable.
        rcsbroot = Path(maxit_found).resolve().parent.parent
        args = {
            "input": cif_file,
            "output": pdb_file,
            "o": 2,
            "log": pdb_file.with_suffix(".log"),
        }
        run_command(
            [maxit, *dict_to_cli_args(args, prefix="-")],
            fg=True,
            env={"RCSBROOT": rcsbroot},
        )
    else:
        # "p" is the output path without its suffix.
        args = {"_": cif_file, "p": pdb_file.parent.joinpath(pdb_file.stem)}
        args = dict_to_cli_args(args, prefix="-", sep="=")
        run_command([beem, *args], fg=True)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def pdb_to_ca_pdb(pdb_file: Path, ca_pdb_file: Path):
    """Copy only the C-alpha ``ATOM`` records of a PDB file to a new file.

    Records are written in their original order. Every other line of the
    input (headers, non-CA atoms, HETATM, TER, END, ...) is dropped.

    Args:
        pdb_file: Path to the input PDB file.
        ca_pdb_file: Path of the C-alpha-only PDB file to write.
    """
    with open(pdb_file, "r") as src, open(ca_pdb_file, "w") as dst:
        dst.writelines(
            record
            for record in src
            # Columns 13-16 (0-based 12:16) hold the atom name.
            if record.startswith("ATOM") and record[12:16].strip() == "CA"
        )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# def pdb_to_aa20_pdb(pdb_file: Path, aa20_pdb_file: Path):
|
|
41
|
+
# """Extract the 20 amino acids from a PDB file and still keep the original order and metadata."""
|
|
42
|
+
# with open(pdb_file, "r") as f, open(aa20_pdb_file, "w") as fw:
|
|
43
|
+
# for line in f:
|
|
44
|
+
# if line.startswith("ATOM") and line[17:20].strip() in (
|
|
45
|
+
# "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY",
|
|
46
|
+
# "HIS", "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "SER",
|
|
47
|
+
# "THR", "TRP", "TYR", "VAL",
|
|
48
|
+
# ):
|
|
49
|
+
# fw.write(line)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def deduel_pdb(pdb_file: Path, deduel_pdb_file: Path, mode=None):
    """Resolve "duel" atoms (alternate locations of the same atom) in a PDB file.

    Consecutive ``ATOM`` records that share record type, atom name, residue
    name, chain and residue number but differ in the alternate-location
    indicator (column 17) are treated as one group and resolved according to
    ``mode``. The alternate-location indicator is blanked on every ``ATOM``
    record written. Non-``ATOM`` lines are copied unchanged, and the original
    record order is preserved.

    Args:
        pdb_file: Path to the input PDB file.
        deduel_pdb_file: Path of the PDB file to write.
        mode: How to resolve a group of alternates: ``"keep"`` (write all),
            ``"keep_first"``, ``"keep_last"`` or ``"average"`` (average the
            coordinates into the first record). Defaults to the module-level
            ``duel`` setting.

    Raises:
        ValueError: If the mode is not one of the supported values.
    """
    if mode is None:
        mode = duel
    if mode not in ("keep", "keep_first", "keep_last", "average"):
        raise ValueError(f"Unknown duel mode: {mode}")

    def is_duel(atom1, atom2):
        #           1         2
        # 01234567890123456789012345
        # ATOM    913  CA ATYR A 113
        # ATOM    914  CA BTYR A 113
        # The key should be "ATOM|CA  |TYR| A| 113"
        # Slices (not indexing) are used so truncated lines cannot raise.
        return (
            atom1[:4] == atom2[:4]
            and atom1[12:16] == atom2[12:16]
            and atom1[17:20] == atom2[17:20]
            and atom1[21:22] == atom2[21:22]
            and atom1[22:26] == atom2[22:26]
            and atom1[16:17] != atom2[16:17]
        )

    def clean_atom(atom):
        # Blank the alternate-location indicator.
        return atom[:16] + " " + atom[17:]

    def resolve(group):
        """Return the records to write for one group of alternates."""
        if not group:
            return []
        if len(group) == 1 or mode == "keep":
            return [clean_atom(atom) for atom in group]
        if mode == "keep_first":
            return [clean_atom(group[0])]
        if mode == "keep_last":
            return [clean_atom(group[-1])]
        # mode == "average": mean of the x/y/z columns, stored in the first
        # record of the group.
        count = len(group)
        x = sum(float(atom[30:38]) for atom in group) / count
        y = sum(float(atom[38:46]) for atom in group) / count
        z = sum(float(atom[46:54]) for atom in group) / count
        first = group[0]
        return [clean_atom(first[:30] + f"{x:8.3f}{y:8.3f}{z:8.3f}" + first[54:])]

    pending = []  # ATOM records of the group currently being collected
    with open(pdb_file, "r") as f, open(deduel_pdb_file, "w") as fw:
        for line in f:
            is_atom = line.startswith("ATOM")
            if is_atom and pending and is_duel(pending[-1], line):
                pending.append(line)
                continue

            # Flush the pending group before writing anything that follows
            # it, so records such as TER/END never overtake the last atom.
            fw.writelines(resolve(pending))
            if is_atom:
                pending = [line]
            else:
                pending = []
                fw.write(line)

        fw.writelines(resolve(pending))
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def index_of(lst, item) -> int:
    """Return the position of the first occurrence of ``item`` in ``lst``.

    Unlike ``list.index``, a missing item yields ``-1`` instead of raising.
    """
    if item in lst:
        return lst.index(item)
    return -1
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _prepare_structure(infile, tag=""):
    """Run the optional pre-processing steps on one input structure.

    Steps, in order: mmCIF -> PDB conversion (for ``.cif`` inputs), C-alpha
    extraction (when ``ca_only``) and duel-atom handling (when ``duel`` is not
    ``"keep"``). Intermediate files are written to ``outdir``.

    Args:
        infile: Path to the input structure.
        tag: Text inserted into the name of the first intermediate file, used
            to keep the two inputs apart when they share a file stem. Later
            intermediates inherit it through the stem.

    Returns:
        The path of the last file produced, or ``infile`` unchanged when no
        step applies.
    """
    if str(infile).endswith(".cif"):
        pdb = Path(outdir) / f"{Path(infile).stem}{tag}.pdb"
        cif_to_pdb(infile, pdb)
        infile, tag = pdb, ""

    if ca_only:
        ca_pdb = Path(outdir) / f"{Path(infile).stem}{tag}.ca.pdb"
        pdb_to_ca_pdb(infile, ca_pdb)  # type: ignore
        infile, tag = ca_pdb, ""

    # NOTE: an "aa20_only" step (keep only the 20 standard amino acids) is
    # currently disabled; it would slot in here.

    if duel != "keep":
        deduel_file = Path(outdir) / f"{Path(infile).stem}{tag}.deduel.pdb"
        deduel_pdb(infile, deduel_file)  # type: ignore
        infile = deduel_file

    return infile


# Inputs from different directories may share a stem (e.g. a/model.pdb and
# b/model.pdb). Without distinct names their intermediates would overwrite
# each other in outdir and the RMSD would be computed on one file against
# itself. File names are unchanged when the stems differ.
_same_stem = Path(infile1).stem == Path(infile2).stem
infile1 = _prepare_structure(infile1, ".1" if _same_stem else "")  # type: ignore
infile2 = _prepare_structure(infile2, ".2" if _same_stem else "")  # type: ignore

envs["_"] = [infile1, infile2]
envs = dict_to_cli_args(envs, dashify=True)

# These options are rewritten from the generated double-dash form to a
# single dash before being passed to calculate_rmsd.
for flag in ("ur", "urks", "nh"):
    idx = index_of(envs, f"--{flag}")
    if idx != -1:
        envs[idx] = f"-{flag}"

out: str = run_command([calculate_rmsd, *envs], stdout="return")  # type: ignore
out = out.strip()

# Anything that is not a bare number is treated as an error message from
# the tool and raised as-is.
try:
    float(out)
except (ValueError, TypeError) as exc:
    raise ValueError(out) from exc

Path(outfile).write_text(out)
|
|
@@ -5,20 +5,20 @@ import re
|
|
|
5
5
|
from pathlib import PosixPath # noqa: F401
|
|
6
6
|
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
7
7
|
|
|
8
|
-
motiffile = {{in.motiffile |
|
|
9
|
-
seqfile = {{in.seqfile |
|
|
10
|
-
outdir = {{out.outdir |
|
|
8
|
+
motiffile: str = {{in.motiffile | quote}} # pyright: ignore # noqa: #999
|
|
9
|
+
seqfile: str = {{in.seqfile | quote}} # pyright: ignore
|
|
10
|
+
outdir: str = {{out.outdir | quote}} # pyright: ignore
|
|
11
11
|
|
|
12
12
|
tool = {{envs.tool | repr}} # pyright: ignore
|
|
13
13
|
fimo = {{envs.fimo | repr}} # pyright: ignore
|
|
14
|
-
motif_col = {{envs.motif_col | repr}} # pyright: ignore
|
|
15
|
-
regulator_col = {{envs.regulator_col | repr}} # pyright: ignore
|
|
14
|
+
motif_col: str | int = {{envs.motif_col | repr}} # pyright: ignore
|
|
15
|
+
regulator_col: str | int = {{envs.regulator_col | repr}} # pyright: ignore
|
|
16
16
|
notfound = {{envs.notfound | repr}} # pyright: ignore
|
|
17
|
-
motifdb = {{envs.motifdb | repr}} # pyright: ignore
|
|
17
|
+
motifdb: str | None = {{envs.motifdb | repr}} # pyright: ignore
|
|
18
18
|
cutoff = {{envs.cutoff | repr}} # pyright: ignore
|
|
19
19
|
q = {{envs.q | repr}} # pyright: ignore
|
|
20
20
|
q_cutoff = {{envs.q_cutoff | repr}} # pyright: ignore
|
|
21
|
-
args = {{envs.args | dict | repr}} # pyright: ignore
|
|
21
|
+
args: dict = {{envs.args | dict | repr}} # pyright: ignore
|
|
22
22
|
|
|
23
23
|
# Check if the tool is supported
|
|
24
24
|
if tool != "fimo":
|
|
@@ -41,7 +41,7 @@ if isinstance(motif_col, str) or isinstance(regulator_col, str):
|
|
|
41
41
|
with open(motiffile, "r") as f:
|
|
42
42
|
header = f.readline().strip().split("\t")
|
|
43
43
|
if isinstance(motif_col, str):
|
|
44
|
-
motif_col = header.index(motif_col) + 1
|
|
44
|
+
motif_col: int = header.index(motif_col) + 1
|
|
45
45
|
if isinstance(regulator_col, str):
|
|
46
46
|
regulator_col = header.index(regulator_col) + 1
|
|
47
47
|
if isinstance(motif_col, int):
|