biopipen 0.32.3__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (117)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +6 -0
  3. biopipen/core/filters.py +35 -23
  4. biopipen/core/testing.py +6 -1
  5. biopipen/ns/bam.py +39 -0
  6. biopipen/ns/cellranger.py +5 -0
  7. biopipen/ns/cellranger_pipeline.py +2 -2
  8. biopipen/ns/cnvkit_pipeline.py +4 -1
  9. biopipen/ns/delim.py +33 -27
  10. biopipen/ns/protein.py +99 -0
  11. biopipen/ns/scrna.py +411 -250
  12. biopipen/ns/snp.py +16 -3
  13. biopipen/ns/tcr.py +125 -1
  14. biopipen/ns/vcf.py +34 -0
  15. biopipen/ns/web.py +5 -1
  16. biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
  17. biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
  18. biopipen/reports/tcr/ClonalStats.svelte +15 -0
  19. biopipen/reports/utils/misc.liq +20 -7
  20. biopipen/scripts/bam/BamMerge.py +2 -2
  21. biopipen/scripts/bam/BamSampling.py +4 -4
  22. biopipen/scripts/bam/BamSort.py +141 -0
  23. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  24. biopipen/scripts/bam/BamSubsetByBed.py +3 -3
  25. biopipen/scripts/bam/CNVpytor.py +10 -10
  26. biopipen/scripts/bam/ControlFREEC.py +11 -11
  27. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  28. biopipen/scripts/bed/BedConsensus.py +5 -5
  29. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  30. biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
  31. biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
  32. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  33. biopipen/scripts/cellranger/CellRangerCount.py +20 -9
  34. biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
  35. biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
  36. biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
  37. biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
  38. biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
  39. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
  41. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  42. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  43. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
  44. biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
  45. biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
  46. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  47. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  48. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  49. biopipen/scripts/delim/SampleInfo.R +85 -148
  50. biopipen/scripts/misc/Config2File.py +2 -2
  51. biopipen/scripts/misc/Str2File.py +2 -2
  52. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  53. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  54. biopipen/scripts/protein/Prodigy.py +4 -4
  55. biopipen/scripts/protein/RMSD.py +178 -0
  56. biopipen/scripts/regulatory/MotifScan.py +8 -8
  57. biopipen/scripts/scrna/CellCellCommunication.py +59 -22
  58. biopipen/scripts/scrna/MarkersFinder.R +273 -654
  59. biopipen/scripts/scrna/RadarPlots.R +73 -53
  60. biopipen/scripts/scrna/SCP-plot.R +15202 -0
  61. biopipen/scripts/scrna/ScVelo.py +0 -0
  62. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
  63. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
  64. biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
  65. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
  66. biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
  67. biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
  68. biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
  69. biopipen/scripts/scrna/SeuratPreparing.R +138 -81
  70. biopipen/scripts/scrna/SlingShot.R +71 -0
  71. biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
  72. biopipen/scripts/snp/Plink2GTMat.py +26 -11
  73. biopipen/scripts/snp/PlinkFilter.py +7 -7
  74. biopipen/scripts/snp/PlinkFromVcf.py +8 -5
  75. biopipen/scripts/snp/PlinkSimulation.py +4 -4
  76. biopipen/scripts/snp/PlinkUpdateName.py +4 -4
  77. biopipen/scripts/stats/ChowTest.R +48 -22
  78. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  79. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  80. biopipen/scripts/tcr/ClonalStats.R +484 -0
  81. biopipen/scripts/tcr/ScRepLoading.R +127 -0
  82. biopipen/scripts/tcr/TCRDock.py +10 -6
  83. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  84. biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
  85. biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
  86. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  87. biopipen/scripts/vcf/BcftoolsSort.py +4 -4
  88. biopipen/scripts/vcf/BcftoolsView.py +5 -5
  89. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  90. biopipen/scripts/vcf/VcfAnno.py +11 -11
  91. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  92. biopipen/scripts/vcf/VcfFilter.py +5 -5
  93. biopipen/scripts/vcf/VcfFix.py +7 -7
  94. biopipen/scripts/vcf/VcfFix_utils.py +12 -3
  95. biopipen/scripts/vcf/VcfIndex.py +3 -3
  96. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  97. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  98. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  99. biopipen/scripts/vcf/bcftools_utils.py +3 -3
  100. biopipen/scripts/web/Download.py +8 -4
  101. biopipen/scripts/web/DownloadList.py +5 -5
  102. biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
  103. biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
  104. biopipen/scripts/web/gcloud_common.py +1 -1
  105. biopipen/utils/gsea.R +75 -35
  106. biopipen/utils/misc.R +205 -7
  107. biopipen/utils/misc.py +17 -8
  108. biopipen/utils/reference.py +11 -11
  109. biopipen/utils/repr.R +146 -0
  110. biopipen/utils/vcf.py +1 -1
  111. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/METADATA +8 -8
  112. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/RECORD +114 -105
  113. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
  114. biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
  115. biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
  116. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
  117. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0
@@ -1,14 +1,11 @@
1
- {{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
2
- {{ biopipen_dir | joinpaths: "utils", "mutate_helpers.R" | source_r }}
3
-
4
1
  library(rlang)
5
2
  library(dplyr)
6
- library(ggplot2)
7
- library(ggprism)
8
- library(ggrepel)
3
+ library(biopipen.utils)
4
+ library(plotthis)
9
5
 
10
6
  infile <- {{in.infile | r}}
11
7
  outfile <- {{out.outfile | r}}
8
+ joboutdir <- {{job.outdir | r}}
12
9
  sep <- {{envs.sep | r}}
13
10
  mutaters <- {{envs.mutaters | r}}
14
11
  save_mutated <- {{envs.save_mutated | r}}
@@ -16,6 +13,9 @@ defaults <- {{envs.defaults | r}}
16
13
  stats <- {{envs.stats | r}}
17
14
  exclude_cols <- {{envs.exclude_cols | r}}
18
15
 
16
+ log <- get_logger()
17
+ reporter <- get_reporter()
18
+
19
19
  if (is.null(exclude_cols)) {
20
20
  exclude_cols <- c()
21
21
  } else {
@@ -29,6 +29,50 @@ if (colnames(indata)[1] == "row.names") {
29
29
  stop("Wrong number of column names. Do you have the right `sep`?")
30
30
  }
31
31
 
32
#' Resolve a plot type to the matching plotthis plotting function
#'
#' A fixed set of aliases (e.g. "hist", "enrichnet", "gseasum") maps to
#' specific plotthis function names. Any other plot type is title-cased and
#' suffixed with "Plot" (e.g. "bar" -> "BarPlot").
#'
#' @param plot_type The plot type
#' @param gglogger_register Pass the function through `gglogger::register()`
#'   before returning it
#' @param return_name Return the name of the function instead of the function
#' @return The function name when `return_name` is TRUE; otherwise the
#'   plotthis function (the result of `gglogger::register()` when
#'   `gglogger_register` is TRUE)
#' @export
get_plotthis_fn <- function(plot_type, gglogger_register = TRUE, return_name = FALSE) {
    # Empty alternatives fall through to the next non-empty one, so every
    # alias of a plot shares a single target name.
    fn_name <- switch(
        plot_type,
        hist = ,
        histo = ,
        histogram = "Histogram",
        featuredim = "FeatureDimPlot",
        splitbar = "SplitBarPlot",
        enrichmap = "EnrichMap",
        enrichnet = ,
        enrichnetwork = "EnrichNetwork",
        gsea = "GSEAPlot",
        gseasummary = ,
        gseasum = "GSEASummaryPlot",
        heatmap = "Heatmap",
        network = "Network",
        pie = "PieChart",
        wordcloud = "WordCloudPlot",
        venn = "VennDiagram",
        paste0(tools::toTitleCase(plot_type), "Plot")
    )

    if (return_name) {
        return(fn_name)
    }

    # A failed namespace lookup is reported in terms of the requested plot type.
    fn <- tryCatch(
        utils::getFromNamespace(fn_name, "plotthis"),
        error = function(e) stop("Unknown plot type: ", plot_type)
    )

    if (gglogger_register) {
        return(gglogger::register(fn, fn_name))
    }
    fn
}
74
+
75
+ log$info("Applying mutaters to the data if any ...")
32
76
  if (!is.null(mutaters) && length(mutaters) > 0) {
33
77
  mutdata <- indata %>%
34
78
  mutate(!!!lapply(mutaters, parse_expr))
@@ -44,7 +88,9 @@ write.table(
44
88
  col.names = TRUE,
45
89
  quote = FALSE
46
90
  )
47
- add_report(
91
+
92
+
93
+ reporter$add(
48
94
  list(
49
95
  kind = "descr",
50
96
  content = "The samples used in the analysis. Each row is a sample, and columns are the meta information about the sample. This is literally the input sample information file, but the paths to the scRNA-seq and scTCR-seq data are hidden.",
@@ -59,153 +105,44 @@ add_report(
59
105
  h1 = "Sample Information"
60
106
  )
61
107
 
62
- theme_set(theme_prism())
63
- for (name in names(stats)) {
64
- stat <- list_update(defaults, stats[[name]])
65
- plotfile <- file.path(outdir, paste0(name, ".png"))
66
-
67
- is_continuous <- FALSE
68
- if (!is.null(stat$subset)) {
69
- data <- mutdata %>% filter(!!parse_expr(stat$subset))
70
- } else {
71
- data <- mutdata
72
- }
73
- if (!is.null(stat$group) && !stat$na_group) {
74
- data <- data %>% filter(!is.na(!!sym(stat$group)))
75
- }
76
- if (!is.null(stat$each) && !stat$na_each) {
77
- data <- data %>% filter(!is.na(!!sym(stat$each)))
78
- }
108
+ if (length(stats) > 0) {
109
+ cases <- expand_cases(stats, defaults)
110
+ for (name in names(cases)) {
111
+ log$info("- Statistic: {name}")
79
112
 
80
- if (is.numeric(data[[stat$on]])) {
81
- is_continuous <- TRUE
82
- }
113
+ case <- cases[[name]]
114
+ info <- case_info(name, outdir, is_dir = FALSE, create = TRUE)
115
+ case <- extract_vars(case, "plot_type", "more_formats", "save_code", "section", "subset", "devpars", "descr")
83
116
 
84
- if (is.null(stat$plot)) {
85
- stat$plot <- if (is_continuous) "boxplot" else "pie"
86
- }
117
+ plot_fn <- get_plotthis_fn(plot_type)
118
+ more_formats <- unique(c("png", more_formats))
87
119
 
88
- data$..group <- "All"
89
- group <- if (is.null(stat$group)) sym("..group") else sym(stat$group)
90
- count_on <- paste0("..count.", stat$on)
91
- if (!is_continuous) {
92
- if (!is.null(stat$each)) {
93
- data <- data %>% add_count(!!group, !!sym(stat$each), name = count_on)
120
+ if (!is.null(subset)) {
121
+ case$data <- mutdata %>% dplyr::filter(!!parse_expr(subset))
94
122
  } else {
95
- data <- data %>% add_count(!!group, name = count_on)
123
+ case$data <- mutdata
96
124
  }
97
- }
98
125
 
99
- if (is.null(stat$devpars)) {
100
- stat$devpars <- list()
101
- }
102
- if (is.null(stat$devpars$width)) {
103
- stat$devpars$width <- 800
104
- }
105
- if (is.null(stat$devpars$height)) {
106
- stat$devpars$height <- 600
107
- }
108
- if (is.null(stat$devpars$res)) {
109
- stat$devpars$res <- 100
110
- }
111
-
112
- png(
113
- plotfile,
114
- width = stat$devpars$width,
115
- height = stat$devpars$height,
116
- res = stat$devpars$res
117
- )
118
- if (stat$plot == "boxplot" || stat$plot == "box") {
119
- p <- ggplot(data, aes(x=!!group, y=!!sym(stat$on), fill=!!group)) +
120
- geom_boxplot(position = "dodge") +
121
- scale_fill_biopipen(alpha = .6) +
122
- xlab("")
123
- } else if (stat$plot == "violin" ||
124
- stat$plot == "violinplot" ||
125
- stat$plot == "vlnplot") {
126
- p <- ggplot(data, aes(x = !!group, y = !!sym(stat$on), fill=!!group)) +
127
- geom_violin(position = "dodge") +
128
- scale_fill_biopipen(alpha = .6) +
129
- xlab("")
130
- } else if (
131
- (grepl("violin", stat$plot) || grepl("vln", stat$plot)) &&
132
- grepl("box", stat$plot)
133
- ) {
134
- p <- ggplot(data, aes(x = !!group, y = !!sym(stat$on), fill = !!group)) +
135
- geom_violin(position = "dodge") +
136
- geom_boxplot(width = 0.1, position = position_dodge(0.9), fill="white") +
137
- scale_fill_biopipen(alpha = .6) +
138
- xlab("")
139
- } else if (stat$plot == "histogram" || stat$plot == "hist") {
140
- p <- ggplot(data, aes(x = !!sym(stat$on), fill = !!group)) +
141
- geom_histogram(bins = 10, position = "dodge", alpha = 0.8, color = "white") +
142
- scale_fill_biopipen(alpha = .6)
143
- } else if (stat$plot == "pie" || stat$plot == "piechart") {
144
- if (is.null(stat$each)) {
145
- data <- data %>% distinct(!!group, .keep_all = TRUE)
146
- } else {
147
- data <- data %>%
148
- distinct(!!group, !!sym(stat$each), .keep_all = TRUE) %>%
149
- mutate(!!group := factor(!!group, levels = unique(!!group))) %>%
150
- group_by(!!sym(stat$each))
126
+ p <- do_call(gglogger::register(plot_fn, name = plot_type), case)
127
+ save_plot(p, info$prefix, devpars, formats = more_formats)
128
+ if (save_code) {
129
+ save_plotcode(
130
+ p,
131
+ setup = c('library(plotthis)', '', 'load("data.RData")', 'list2env(case, envir = .GlobalEnv)'),
132
+ prefix = info$caseprefix,
133
+ "case",
134
+ auto_data_setup = FALSE
135
+ )
151
136
  }
152
- p <- ggplot(
153
- data %>% mutate(.size = sum(!!sym(count_on))),
154
- aes(x = sqrt(.size) / 2, width = sqrt(.size), y = !!sym(count_on), fill = !!group, label = !!sym(count_on))
155
- ) +
156
- geom_bar(stat="identity", color="white", position = position_fill(reverse = TRUE)) +
157
- coord_polar("y", start = 0) +
158
- theme_void() +
159
- theme(plot.title = element_text(hjust = 0.5)) +
160
- geom_label_repel(
161
- position = position_fill(reverse = TRUE,vjust = .5),
162
- color="#333333",
163
- fill="#EEEEEE",
164
- size=4
165
- ) +
166
- scale_fill_biopipen(alpha = .6, name = group) +
167
- ggtitle(paste0("# ", stat$on))
168
- } else if (stat$plot == "bar" || stat$plot == "barplot") {
169
- if (is.null(stat$each)) {
170
- data <- data %>% distinct(!!group, .keep_all = TRUE)
171
- } else {
172
- data <- data %>% distinct(!!group, !!sym(stat$each), .keep_all = TRUE)
173
- }
174
- p <- ggplot(
175
- data,
176
- aes(x = !!group, y = !!sym(count_on), fill = !!group)) +
177
- geom_bar(stat = "identity") +
178
- scale_fill_biopipen(alpha = .6) +
179
- ylab(paste0("# ", stat$on))
180
- } else {
181
- stop("Unknown plot type: ", stat$plot)
182
- }
183
- if (!is.null(stat$each)) {
184
- p <- p + facet_wrap(vars(!!sym(stat$each)), ncol = stat$ncol)
137
+
138
+ reporter$add(
139
+ reporter$image(
140
+ info$prefix,
141
+ c("png", more_formats),
142
+ save_code,
143
+ kind = "table_image"
144
+ ),
145
+ h1 = "Statistics", ui = "table_of_images:2"
146
+ )
185
147
  }
186
- print(p)
187
- dev.off()
188
-
189
- plotfile_pdf <- file.path(outdir, paste0(name, ".pdf"))
190
- pdf(
191
- plotfile_pdf,
192
- width = stat$devpars$width / stat$devpars$res,
193
- height = stat$devpars$height / stat$devpars$res
194
- )
195
- print(p)
196
- dev.off()
197
-
198
- by_desc <- ifelse(is.null(stat$by), "", paste0(" by ", stat$by))
199
- descr <- ifelse(
200
- is_continuous,
201
- paste0("The distribution of ", stat$on, by_desc),
202
- paste0("The number of ", stat$on, by_desc)
203
- )
204
- add_report(
205
- list(kind = "table_image", src = plotfile, name = name, descr = descr, download = plotfile_pdf),
206
- h1 = "Statistics",
207
- ui = "table_of_images:2"
208
- )
209
148
  }
210
-
211
- save_report(outdir)
@@ -1,8 +1,8 @@
1
1
  import json
2
2
  import rtoml
3
3
 
4
- configstr = {{in.config | repr}} # pyright: ignore
5
- outfile = {{out.outfile | quote}} # pyright: ignore
4
+ configstr: str = {{in.config | quote}} # pyright: ignore # noqa
5
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
6
6
  infmt = {{envs.infmt | quote}} # pyright: ignore
7
7
  outfmt = {{envs.outfmt | quote}} # pyright: ignore
8
8
 
@@ -1,6 +1,6 @@
1
- instr = {{in.str | repr}} # pyright: ignore
1
+ instr: str = {{in.str | quote}} # pyright: ignore # noqa
2
2
  name = {{repr(in.name or envs.name)}} # pyright: ignore
3
- outfile = {{out.outfile | quote}} # pyright: ignore
3
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
4
4
 
5
5
  with open(outfile, "wt") as fout:
6
6
  fout.write(instr)
@@ -0,0 +1,33 @@
1
+ from pathlib import Path
2
+ from shutil import which
3
+ from diot import Diot # noqa: F401
4
+ from biopipen.utils.misc import run_command, dict_to_cli_args
5
+
6
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa
7
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
8
+ envs: dict = {{envs | repr}} # pyright: ignore
9
+ tool: str = envs.pop("tool", "maxit")
10
+ maxit: str = envs.pop("maxit", "maxit")
11
+ beem = envs.pop("beem", "BeEM")
12
+
13
+ if tool == "maxit":
14
+ maxit_found = which(maxit)
15
+ if not maxit_found:
16
+ raise ValueError(f"maxit executable not found: {maxit}")
17
+
18
+ maxit_exe = Path(maxit_found).expanduser().resolve()
19
+ rcsbroot = maxit_exe.parent.parent
20
+ envs["input"] = infile
21
+ envs["output"] = outfile
22
+ envs["o"] = 2
23
+ envs["log"] = Path(outfile).with_suffix(".log")
24
+ run_command([maxit, *dict_to_cli_args(envs, prefix="-")], fg=True, env={"RCSBROOT": rcsbroot})
25
+
26
+ else:
27
+ outfile: Path = Path(outfile) # type: ignore
28
+ envs["_"] = infile
29
+ envs["p"] = outfile.parent.joinpath(outfile.stem)
30
+ envs["outfmt"] = 3
31
+ args = dict_to_cli_args(envs, prefix="-", sep="=")
32
+
33
+ run_command([beem, *args], fg=True)
@@ -0,0 +1,60 @@
1
+ # """
2
+ # LICENSE
3
+
4
+ # GNU General Public License v2.0
5
+
6
+ # The code is based on the script from:
7
+ # https://github.com/kad-ecoli/pdb2fasta/blob/master/pdb2fasta.py
8
+
9
+ # The original code is licensed under GNU General Public License v2.0.
10
+ # The original code is modified by biopipen developers to fit the biopipen.
11
+ # """
12
+ from __future__ import annotations
13
+ import re
14
+ from collections import defaultdict
15
+ from pathlib import Path
16
+
17
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa: E999
18
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
19
+ chains: str | list | None = {{envs.chains | repr}} # pyright: ignore
20
+ wrap: int = {{envs.wrap | repr}} # pyright: ignore
21
+
22
+ if isinstance(chains, str):
23
+ chains = [chain.strip() for chain in chains.split(",")]
24
+
25
+ aa3to1 = {
26
+ 'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M',
27
+ 'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K',
28
+ 'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H',
29
+ 'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G',
30
+ 'MSE':'M',
31
+ }
32
+
33
+ ca_pattern = re.compile(
34
+ r"^ATOM\s{2,6}\d{1,5}\s{2}CA\s[\sA]([A-Z]{3})\s([\s\w])|^HETATM\s{0,4}\d{1,5}\s{2}CA\s[\sA](MSE)\s([\s\w])" # noqa: W605
35
+ )
36
+
37
+ filename = Path(infile).stem
38
+ chain_dict = defaultdict(str)
39
+
40
+ with open(infile, 'r') as fp:
41
+ for line in fp:
42
+ if line.startswith("ENDMDL"):
43
+ break
44
+
45
+ match_list = ca_pattern.findall(line)
46
+ if match_list:
47
+ resn = match_list[0][0] + match_list[0][2]
48
+ chain = match_list[0][1] + match_list[0][3]
49
+ if chains is None or chain in chains:
50
+ chain_dict[chain] += aa3to1[resn]
51
+
52
# Write one FASTA record per chain, in the order the chains were first seen.
# `wrap` is the number of residues per output line; a value <= 0 disables
# wrapping. The width used to be hard-coded to 80, so any other positive
# `wrap` was silently ignored.
# A literal True (boolean-style "wrap on") keeps the historical width of 80.
line_width = 80 if wrap is True else wrap

with open(outfile, 'w') as fp:
    for chain, sequence in chain_dict.items():
        fp.write(f">{filename}:{chain}\n")
        if line_width > 0:
            for i in range(0, len(sequence), line_width):
                fp.write(sequence[i:i + line_width] + "\n")
        else:
            fp.write(sequence + "\n")
@@ -2,15 +2,15 @@ import json
2
2
  import logging
3
3
  import sys
4
4
  from pathlib import Path
5
- from prodigy_prot.predict_IC import (
5
+ from prodigy_prot.predict_IC import ( # type: ignore
6
6
  Prodigy,
7
7
  check_path,
8
8
  parse_structure,
9
9
  )
10
10
 
11
- infile = {{in.infile | repr}} # pyright: ignore # noqa
12
- outfile = {{out.outfile | repr}} # pyright: ignore
13
- outdir = {{out.outdir | repr}} # pyright: ignore
11
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa
12
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
13
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
14
14
  distance_cutoff = {{envs.distance_cutoff | float}} # pyright: ignore
15
15
  acc_threshold = {{envs.acc_threshold | float}} # pyright: ignore
16
16
  temperature = {{envs.temperature | float}} # pyright: ignore
@@ -0,0 +1,178 @@
1
+ from pathlib import Path
2
+ from shutil import which
3
+ from diot import Diot # noqa: F401
4
+ from biopipen.utils.misc import run_command, dict_to_cli_args
5
+
6
+ infile1: str = {{in.infile1 | quote}} # pyright: ignore # noqa
7
+ infile2: str = {{in.infile2 | quote}} # pyright: ignore # noqa
8
+ outfile: str = {{out.outfile | quote}} # pyright: ignore # noqa
9
+ outdir: str = {{job.outdir | quote}} # pyright: ignore # noqa
10
+ envs: dict = {{envs | repr}} # pyright: ignore # noqa
11
+ conv_tool = envs.pop("conv_tool", "maxit")
12
+ maxit = envs.pop("maxit", "maxit")
13
+ beem = envs.pop("beem", "BeEM")
14
+ ca_only = envs.pop("ca_only", False)
15
+ # aa20_only = envs.pop("aa20_only", False)
16
+ duel = envs.pop("duel", "keep")
17
+ calculate_rmsd = envs.pop("calculate_rmsd", "calculate_rmsd")
18
+
19
+
20
def cif_to_pdb(cif_file, pdb_file: Path):
    """Convert an mmCIF file to a PDB file with the configured converter.

    The converter is selected by the module-level `conv_tool`: "maxit" runs
    the `maxit` executable, any other value runs the `beem` executable.

    Args:
        cif_file: Path to the input mmCIF file.
        pdb_file: Path of the PDB file to produce. For maxit the log is
            written next to it with a ".log" suffix; for BeEM the output
            prefix is `pdb_file` without its suffix.

    Raises:
        ValueError: If `conv_tool` is "maxit" and the maxit executable
            cannot be found.
    """
    if conv_tool == "maxit":
        maxit_found = which(maxit)
        if not maxit_found:
            # which() returns None for a missing executable; Path(None) would
            # fail with an unhelpful TypeError instead.
            raise ValueError(f"maxit executable not found: {maxit}")

        # RCSBROOT is set to the directory two levels above the resolved binary.
        rcsbroot = Path(maxit_found).resolve().parent.parent
        args = {
            "input": cif_file,
            "output": pdb_file,
            "o": 2,
            "log": pdb_file.with_suffix(".log"),
        }
        run_command(
            [maxit, *dict_to_cli_args(args, prefix="-")],
            fg=True,
            env={"RCSBROOT": rcsbroot},
        )
    else:
        args = {"_": cif_file, "p": pdb_file.parent.joinpath(pdb_file.stem)}
        args = dict_to_cli_args(args, prefix="-", sep="=")
        run_command([beem, *args], fg=True)
30
+
31
+
32
def pdb_to_ca_pdb(pdb_file: Path, ca_pdb_file: Path):
    """Copy the C-alpha ATOM records of a PDB file to a new file.

    Only lines that start with "ATOM" and whose atom-name field
    (columns 12-15, zero-based) is "CA" are written, in their original
    order. Every other line is left out.

    Args:
        pdb_file: The PDB file to read.
        ca_pdb_file: The file to write the C-alpha records to.
    """
    with open(pdb_file, "r") as source, open(ca_pdb_file, "w") as sink:
        sink.writelines(
            record
            for record in source
            if record.startswith("ATOM") and record[12:16].strip() == "CA"
        )
38
+
39
+
40
+ # def pdb_to_aa20_pdb(pdb_file: Path, aa20_pdb_file: Path):
41
+ # """Extract the 20 amino acids from a PDB file and still keep the original order and metadata."""
42
+ # with open(pdb_file, "r") as f, open(aa20_pdb_file, "w") as fw:
43
+ # for line in f:
44
+ # if line.startswith("ATOM") and line[17:20].strip() in (
45
+ # "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY",
46
+ # "HIS", "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "SER",
47
+ # "THR", "TRP", "TYR", "VAL",
48
+ # ):
49
+ # fw.write(line)
50
+
51
+
52
def deduel_pdb(pdb_file: Path, deduel_pdb_file: Path, mode=None):
    """Resolve pairs of alternate-location ("duel") ATOM records in a PDB file.

    An ATOM line forms a pair with the previously buffered ATOM line when
    both share record type, atom name, residue name, chain and residue
    number but differ in the alternate-location column (index 16). The
    alternate-location column of every ATOM line written is blanked. All
    other lines are copied unchanged and keep their position relative to the
    ATOM records around them.

    Args:
        pdb_file: The PDB file to read.
        deduel_pdb_file: The PDB file to write.
        mode: How a pair is resolved: "keep" (write both records),
            "keep_first", "keep_last" or "average" (the first record with
            the mean x/y/z of both). Any other value drops the second
            record and leaves the first one buffered. Defaults to the
            module-level `duel` setting.
    """
    if mode is None:
        mode = duel

    def is_duel(atom1, atom2):
        #           1         2
        # 01234567890123456789012345
        # ATOM    913  CA ATYR A 113
        # ATOM    914  CA BTYR A 113
        # The key should be "ATOM|CA |TYR| A| 113"
        return (
            atom1[:4] == atom2[:4] and
            atom1[12:16] == atom2[12:16] and
            atom1[17:20] == atom2[17:20] and
            atom1[21] == atom2[21] and
            atom1[22:26] == atom2[22:26] and
            atom1[16] != atom2[16]
        )

    def clean_atom(atom):
        # Blank the alternate-location column.
        return atom[:16] + " " + atom[17:]

    pending = ""   # ATOM record still waiting for a possible partner
    trailing = []  # non-ATOM lines read after `pending`; held back so they
    #                are not emitted ahead of the record they followed

    with open(pdb_file, "r") as f, open(deduel_pdb_file, "w") as fw:
        for line in f:
            if not line.startswith("ATOM"):
                if pending:
                    trailing.append(line)
                else:
                    fw.write(line)
                continue

            if not is_duel(pending, line):
                if pending:
                    fw.write(clean_atom(pending))
                    fw.writelines(trailing)
                    trailing.clear()
                pending = line
                continue

            # `pending` and `line` are alternate locations of the same atom.
            if mode == "keep":
                fw.write(clean_atom(pending))
                fw.writelines(trailing)
                fw.write(clean_atom(line))
            elif mode == "keep_first":
                fw.write(clean_atom(pending))
                fw.writelines(trailing)
            elif mode == "keep_last":
                fw.writelines(trailing)
                fw.write(clean_atom(line))
            elif mode == "average":
                # Average the coordinates
                x = (float(pending[30:38]) + float(line[30:38])) / 2.0
                y = (float(pending[38:46]) + float(line[38:46])) / 2.0
                z = (float(pending[46:54]) + float(line[46:54])) / 2.0
                fw.write(
                    clean_atom(
                        pending[:30] + f"{x:8.3f}{y:8.3f}{z:8.3f}" + pending[54:]
                    )
                )
                fw.writelines(trailing)
            else:
                # Unrecognized mode: the second record is dropped and the
                # first stays buffered.
                continue
            trailing.clear()
            pending = ""

        if pending:
            # The last buffered record gets the same cleaning as all others.
            fw.write(clean_atom(pending))
            fw.writelines(trailing)
109
+
110
+
111
def index_of(lst, item) -> int:
    """Return the position of the first occurrence of `item` in `lst`, or -1.

    Args:
        lst: The sequence to search.
        item: The value to look for.

    Returns:
        The zero-based index of the first match, or -1 when `item` is absent.
    """
    return lst.index(item) if item in lst else -1
116
+
117
+
118
+ if infile1.endswith(".cif"):
119
+ pdb1 = Path(outdir) / f"{Path(infile1).stem}.pdb"
120
+ cif_to_pdb(infile1, pdb1)
121
+ infile1 = pdb1 # type: ignore
122
+
123
+ if infile2.endswith(".cif"):
124
+ pdb2 = Path(outdir) / f"{Path(infile2).stem}.pdb"
125
+ cif_to_pdb(infile2, pdb2)
126
+ infile2 = pdb2 # type: ignore
127
+
128
+ if ca_only:
129
+ ca_pdb1 = Path(outdir) / f"{Path(infile1).stem}.ca.pdb"
130
+ pdb_to_ca_pdb(infile1, ca_pdb1) # type: ignore
131
+ infile1 = ca_pdb1 # type: ignore
132
+
133
+ ca_pdb2 = Path(outdir) / f"{Path(infile2).stem}.ca.pdb"
134
+ pdb_to_ca_pdb(infile2, ca_pdb2) # type: ignore
135
+ infile2 = ca_pdb2 # type: ignore
136
+
137
+ # if aa20_only:
138
+ # aa20_pdb1 = Path(outdir) / f"{Path(infile1).stem}.aa20.pdb"
139
+ # pdb_to_aa20_pdb(infile1, aa20_pdb1) # type: ignore
140
+ # infile1 = aa20_pdb1 # type: ignore
141
+
142
+ # aa20_pdb2 = Path(outdir) / f"{Path(infile2).stem}.aa20.pdb"
143
+ # pdb_to_aa20_pdb(infile2, aa20_pdb2) # type: ignore
144
+ # infile2 = aa20_pdb2 # type: ignore
145
+
146
+ if duel != "keep":
147
+ deduel_pdb1 = Path(outdir) / f"{Path(infile1).stem}.deduel.pdb"
148
+ deduel_pdb(infile1, deduel_pdb1) # type: ignore
149
+ infile1 = deduel_pdb1 # type: ignore
150
+
151
+ deduel_pdb2 = Path(outdir) / f"{Path(infile2).stem}.deduel.pdb"
152
+ deduel_pdb(infile2, deduel_pdb2) # type: ignore
153
+ infile2 = deduel_pdb2 # type: ignore
154
+
155
+ envs["_"] = [infile1, infile2]
156
+ envs = dict_to_cli_args(envs, dashify=True)
157
+
158
+ idx_ur = index_of(envs, "--ur")
159
+ if idx_ur != -1:
160
+ envs[idx_ur] = "-ur"
161
+
162
+ idx_urks = index_of(envs, "--urks")
163
+ if idx_urks != -1:
164
+ envs[idx_urks] = "-urks"
165
+
166
+ idx_nh = index_of(envs, "--nh")
167
+ if idx_nh != -1:
168
+ envs[idx_nh] = "-nh"
169
+
170
+ out: str = run_command([calculate_rmsd, *envs], stdout="return") # type: ignore
171
+ out = out.strip()
172
+
173
+ try:
174
+ float(out)
175
+ except (ValueError, TypeError):
176
+ raise ValueError(out)
177
+
178
+ Path(outfile).write_text(out)
@@ -5,20 +5,20 @@ import re
5
5
  from pathlib import PosixPath # noqa: F401
6
6
  from biopipen.utils.misc import run_command, dict_to_cli_args, logger
7
7
 
8
- motiffile = {{in.motiffile | repr}} # pyright: ignore # noqa: #999
9
- seqfile = {{in.seqfile | repr}} # pyright: ignore
10
- outdir = {{out.outdir | repr}} # pyright: ignore
8
+ motiffile: str = {{in.motiffile | quote}} # pyright: ignore # noqa: #999
9
+ seqfile: str = {{in.seqfile | quote}} # pyright: ignore
10
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
11
11
 
12
12
  tool = {{envs.tool | repr}} # pyright: ignore
13
13
  fimo = {{envs.fimo | repr}} # pyright: ignore
14
- motif_col = {{envs.motif_col | repr}} # pyright: ignore
15
- regulator_col = {{envs.regulator_col | repr}} # pyright: ignore
14
+ motif_col: str | int = {{envs.motif_col | repr}} # pyright: ignore
15
+ regulator_col: str | int = {{envs.regulator_col | repr}} # pyright: ignore
16
16
  notfound = {{envs.notfound | repr}} # pyright: ignore
17
- motifdb = {{envs.motifdb | repr}} # pyright: ignore
17
+ motifdb: str | None = {{envs.motifdb | repr}} # pyright: ignore
18
18
  cutoff = {{envs.cutoff | repr}} # pyright: ignore
19
19
  q = {{envs.q | repr}} # pyright: ignore
20
20
  q_cutoff = {{envs.q_cutoff | repr}} # pyright: ignore
21
- args = {{envs.args | dict | repr}} # pyright: ignore
21
+ args: dict = {{envs.args | dict | repr}} # pyright: ignore
22
22
 
23
23
  # Check if the tool is supported
24
24
  if tool != "fimo":
@@ -41,7 +41,7 @@ if isinstance(motif_col, str) or isinstance(regulator_col, str):
41
41
  with open(motiffile, "r") as f:
42
42
  header = f.readline().strip().split("\t")
43
43
  if isinstance(motif_col, str):
44
- motif_col = header.index(motif_col) + 1
44
+ motif_col: int = header.index(motif_col) + 1
45
45
  if isinstance(regulator_col, str):
46
46
  regulator_col = header.index(regulator_col) + 1
47
47
  if isinstance(motif_col, int):