biopipen 0.31.4__py3-none-any.whl → 0.31.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/ns/bam.py +41 -0
- biopipen/ns/protein.py +84 -0
- biopipen/ns/regulatory.py +72 -0
- biopipen/ns/vcf.py +7 -3
- biopipen/reports/protein/ProdigySummary.svelte +16 -0
- biopipen/scripts/bam/BamMerge.py +10 -14
- biopipen/scripts/bam/BamSampling.py +90 -0
- biopipen/scripts/protein/Prodigy.py +119 -0
- biopipen/scripts/protein/ProdigySummary.R +133 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +5 -143
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +31 -37
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +25 -26
- biopipen/scripts/regulatory/VariantMotifPlot.R +76 -0
- biopipen/scripts/regulatory/motifs-common.R +322 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +1 -2
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/METADATA +1 -1
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/RECORD +21 -16
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/entry_points.txt +1 -0
- biopipen/scripts/regulatory/atSNP.R +0 -33
- biopipen/scripts/regulatory/motifBreakR.R +0 -1594
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/WHEEL +0 -0
biopipen/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.31.4"
|
|
1
|
+
__version__ = "0.31.6"
|
biopipen/ns/bam.py
CHANGED
|
@@ -260,3 +260,44 @@ class BamMerge(Proc):
|
|
|
260
260
|
"sort_args": [],
|
|
261
261
|
}
|
|
262
262
|
script = "file://../scripts/bam/BamMerge.py"
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
class BamSampling(Proc):
    """Subsample a bam file, retaining only a portion of its read pairs

    Input:
        bamfile: The bam file to subsample

    Output:
        outfile: The subsampled bam file

    Envs:
        ncores: How many cores to use
        samtools: Path to the samtools executable
        tool: Which tool to run; "samtools" is the only one supported for now
        fraction (type=float): How many reads to retain.
            With `0 < fraction <= 1`, it is the proportion of reads to retain.
            With `fraction > 1`, it is the number of reads to retain.
            Be aware that with fraction > 1, the output holds roughly,
            but not necessarily exactly, the requested number of reads.
        seed: Seed of the random number generator
        index: Whether the output bam file should be indexed
        sort: Whether the output bam file should be sorted
        sort_args: Arguments passed to `samtools sort` to sort the bam file.
            `-o`, `-@`, and `--threads` must not be given here,
            since the script sets them itself.
    """
    input = "bamfile:file"
    output = "outfile:file:{{in.bamfile | stem}}.sampled{{envs.fraction}}.bam"
    lang = config.lang.python
    envs = dict(
        ncores=config.misc.ncores,
        samtools=config.exe.samtools,
        tool="samtools",
        fraction=None,
        seed=8525,
        index=True,
        sort=True,
        sort_args=[],
    )
    script = "file://../scripts/bam/BamSampling.py"
|
biopipen/ns/protein.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Protein-related processes."""
|
|
2
|
+
from ..core.proc import Proc
|
|
3
|
+
from ..core.config import config
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Prodigy(Proc):
    """Prediction of binding affinity of protein-protein complexes based on
    intermolecular contacts using Prodigy.

    See <https://rascar.science.uu.nl/prodigy/> and
    <https://github.com/haddocking/prodigy>.

    `prodigy-prot` must be installed under the given python of `proc.lang`.

    Input:
        infile: The structure file in PDB or mmCIF format.

    Output:
        outfile: The output file generated by Prodigy, in the format selected
            by `envs.outtype`. It is located inside `out.outdir`.
        outdir: The output directory containing all output files.

    Envs:
        distance_cutoff (type=float): The distance cutoff to calculate intermolecular
            contacts.
        acc_threshold (type=float): The accessibility threshold for BSA analysis.
        temperature (type=float): The temperature (C) for Kd prediction.
        contact_list (flag): Whether to generate contact list.
        pymol_selection (flag): Whether output a script to highlight the interface
            residues in PyMOL.
        selection (list): The selection of the chains to analyze.
            `['A', 'B']` will analyze chains A and B.
            `['A,B', 'C']` will analyze chain A and C; and B and C.
            `['A', 'B', 'C']` will analyze all combinations of A, B, and C.
        outtype (choice): Set the format of the output file (`out.outfile`).
            All three files will be generated. This option only determines which
            is assigned to `out.outfile`.
            - raw: The raw output file from prodigy.
            - json: The output file in JSON format.
            - tsv: The output file in TSV (tab-separated) format.
    """
    input = "infile:file"
    output = [
        # `raw` has no natural extension, so the raw file is named `<stem>.out`
        "outfile:file:{{in.infile | stem}}_prodigy/"
        "{{in.infile | stem}}.{{envs.outtype if envs.outtype != 'raw' else 'out'}}",
        "outdir:dir:{{in.infile | stem}}_prodigy",
    ]
    lang = config.lang.python
    envs = {
        "distance_cutoff": 5.5,
        "acc_threshold": 0.05,
        "temperature": 25.0,
        "contact_list": True,
        "pymol_selection": True,
        "selection": None,
        "outtype": "json",
    }
    script = "file://../scripts/protein/Prodigy.py"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ProdigySummary(Proc):
    """Summary of the output from `Prodigy`.

    Input:
        infiles: The TSV files generated by `Prodigy`.
            By default, `_prodigy.tsv` under each directory in the `outdir`
            column of the incoming channel is used.

    Output:
        outdir: The directory of summary files generated by `ProdigySummary`.

    Envs:
        group (type=auto): The group of the samples for boxplots.
            If `None`, don't do boxplots.
            It can be a dict of group names and sample names, e.g.
            `{"group1": ["sample1", "sample2"], "group2": ["sample3"]}`
            or a file containing the group information, with the first column
            being the sample names and the second column being the group names.
            The file should be tab-delimited with no header.
    """
    input = "infiles:files"
    # Gather `<outdir>/_prodigy.tsv` of every row of the incoming channel into
    # a single row, so that all files are passed together as `in.infiles`.
    input_data = lambda ch: [[f"{odir}/_prodigy.tsv" for odir in ch.outdir]]
    output = "outdir:dir:prodigy_summary"
    lang = config.lang.rscript
    envs = {"group": None}
    script = "file://../scripts/protein/ProdigySummary.R"
    plugin_opts = {"report": "file://../reports/protein/ProdigySummary.svelte"}
|
biopipen/ns/regulatory.py
CHANGED
|
@@ -212,3 +212,75 @@ class MotifAffinityTest(Proc):
|
|
|
212
212
|
"atsnp_args": {"padj_cutoff": True, "padj": "BH", "p": "pval_diff"},
|
|
213
213
|
}
|
|
214
214
|
script = "file://../scripts/regulatory/MotifAffinityTest.R"
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class VariantMotifPlot(Proc):
    """Plot the genomic region around a genomic variant, together with the
    motifs that are potentially disrupted.

    Only SNVs are supported for now.

    Input:
        infile: The file with the variants and the motifs.
            A TAB-delimited file is expected, with these columns:
            - chrom: Chromosome of the SNV. Alias: chr, seqnames.
            - start: Start position of the SNV, either 0- or 1-based.
            - end: End position of the SNV; it is used as the position of
                the SNV.
            - strand: The direction of the surrounding sequence that matches
                the motif.
            - SNP_id: Name of the SNV.
            - REF: Reference allele of the SNV.
            - ALT: Alternative allele of the SNV.
            - providerId: The motif id. The column can be set by
                `envs.motif_col`.
            - providerName: Name of the motif provider. Optional.
            - Regulator: Name of the regulator. Optional; the column can be
                set by `envs.regulator_col`.
            - motifPos: Position of the motif, relative to the position of
                the SNV.
                For example, '-8, 4' means the motif is 8 bp upstream and
                4 bp downstream of the SNV.

    Envs:
        genome: The genome assembly.
            The sequences around the variants are fetched by package using it;
            for example, `BSgenome.Hsapiens.UCSC.hg19` is required for `hg19`.
            For an organism other than human, give the full name of the
            package instead, for example, `BSgenome.Mmusculus.UCSC.mm10`.
        motifdb: Path to the motif database. This is required.
            The MEME motif database format is expected.
            Databases are available for download here:
            <https://meme-suite.org/meme/doc/download.html>.
            An introduction to the databases is also available:
            <https://meme-suite.org/meme/db/motifs>.
            Reading the motif database requires
            [universalmotif](https://github.com/bjmt/universalmotif).
        motif_col: Name of the column in the motif file holding the motif names.
            When it is not given, `envs.regulator_col` and `envs.regmotifs`
            are required, so that the motif names can be inferred from the
            regulator names.
        regulator_col: Name of the column in the motif file holding the
            regulator names.
            `motif_col` and `regulator_col` should both be either the direct
            column names or the index (1-based) of the columns.
            Without `regulator_col`, no regulator information is written in
            the output. With it, the regulator information goes to the
            `Regulator` column of the output.
        regmotifs: Path to the regulator-motif mapping file.
            A header is required, with the column `Motif` or `Model` for the
            motif names, and `TF`, `Regulator` or `Transcription factor` for
            the regulator names.
        notfound (choice): The action to take when a motif is not found in the
            database, or a regulator is not found in the regulator-motif
            mapping (envs.regmotifs) file.
            - error: Report error and stop the process.
            - ignore: Ignore the motif and continue.
        devpars (ns): Default device parameters of the plot.
            - width (type=int): Width of the plot.
            - height (type=int): Height of the plot.
            - res (type=int): Resolution of the plot.
        plot_vars (type=auto): The variants (SNP_id) to plot.
            Either a list of variant names, or a string of the variant names
            separated by comma.
            All variants are plotted when this is not specified.
    """
    input = "infile:file"
    output = "outdir:dir:{{in.infile | stem}}.vmplots"
    lang = config.lang.rscript
    envs = dict(
        genome=config.ref.genome,
        motifdb=config.ref.tf_motifdb,
        motif_col="providerId",
        regulator_col=None,
        regmotifs=config.ref.tf_motifs,
        notfound="error",
        devpars=dict(width=800, height=None, res=100),
        plot_vars=None,
    )
    script = "file://../scripts/regulatory/VariantMotifPlot.R"
|
biopipen/ns/vcf.py
CHANGED
|
@@ -335,6 +335,8 @@ class TruvariBench(Proc):
|
|
|
335
335
|
"""Run `truvari bench` to compare a VCF with CNV calls and
|
|
336
336
|
base CNV standards
|
|
337
337
|
|
|
338
|
+
Requires truvari v4+
|
|
339
|
+
|
|
338
340
|
See https://github.com/ACEnglish/truvari/wiki/bench
|
|
339
341
|
|
|
340
342
|
Input:
|
|
@@ -358,7 +360,7 @@ class TruvariBench(Proc):
|
|
|
358
360
|
"truvari": config.exe.truvari,
|
|
359
361
|
"ref": config.ref.reffa,
|
|
360
362
|
"refdist": 500,
|
|
361
|
-
"
|
|
363
|
+
"pctseq": 0.7,
|
|
362
364
|
"pctsize": 0.7,
|
|
363
365
|
"pctovl": 0.0,
|
|
364
366
|
"typeignore": False,
|
|
@@ -402,7 +404,7 @@ class TruvariBenchSummary(Proc):
|
|
|
402
404
|
output = "outdir:dir:truvari_bench.summary"
|
|
403
405
|
lang = config.lang.rscript
|
|
404
406
|
envs = {
|
|
405
|
-
"plots": ["
|
|
407
|
+
"plots": ["comp cnt", "base cnt", "precision", "recall", "f1"],
|
|
406
408
|
"devpars": None,
|
|
407
409
|
}
|
|
408
410
|
script = "file://../scripts/vcf/TruvariBenchSummary.R"
|
|
@@ -414,6 +416,8 @@ class TruvariConsistency(Proc):
|
|
|
414
416
|
|
|
415
417
|
See https://github.com/ACEnglish/truvari/wiki/consistency
|
|
416
418
|
|
|
419
|
+
Requires truvari v4+
|
|
420
|
+
|
|
417
421
|
Input:
|
|
418
422
|
vcfs: The vcf files with CNV calls
|
|
419
423
|
|
|
@@ -463,7 +467,7 @@ class BcftoolsAnnotate(Proc):
|
|
|
463
467
|
columns (auto): Comma-separated or list of columns or tags to carry over from
|
|
464
468
|
the annotation file. Overrides `-c, --columns`
|
|
465
469
|
remove (auto): Remove the specified columns from the input file
|
|
466
|
-
header (
|
|
470
|
+
header (list): Headers to be added
|
|
467
471
|
gz (flag): Whether to gzip the output file
|
|
468
472
|
index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
|
|
469
473
|
<more>: Other arguments for `bcftools annotate`
|
|
biopipen/reports/protein/ProdigySummary.svelte
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{#- Report template: renders all jobs through the imported `report_jobs` macro. -#}
{% from "utils/misc.liq" import report_jobs -%}

<script>
    import { Image, DataTable, Descr } from "$libs";
</script>

{#- Body of a single job; delegates to the `render_job` filter (defined elsewhere) with heading level `h`. -#}
{%- macro report_job(job, h=1) -%}
{{ job | render_job: h=h }}
{%- endmacro -%}


{#- Heading of a single job: the escaped stem of the job's output directory. -#}
{%- macro head_job(job) -%}
<h1>{{job.out.outdir | stem | escape}}</h1>
{%- endmacro -%}

{{ report_jobs(jobs, head_job, report_job) }}
|
biopipen/scripts/bam/BamMerge.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
-
from biopipen.utils.misc import run_command
|
|
2
|
+
from biopipen.utils.misc import run_command, logger
|
|
3
3
|
|
|
4
|
-
bamfiles = {{in.bamfiles | repr}} # pyright: ignore
|
|
4
|
+
bamfiles = {{in.bamfiles | repr}} # pyright: ignore # noqa
|
|
5
5
|
outfile = Path({{out.outfile | repr}}) # pyright: ignore
|
|
6
6
|
ncores = {{envs.ncores | int}} # pyright: ignore
|
|
7
7
|
tool = {{envs.tool | quote}} # pyright: ignore
|
|
@@ -18,7 +18,7 @@ if should_index and not should_sort:
|
|
|
18
18
|
|
|
19
19
|
def use_samtools():
|
|
20
20
|
"""Use samtools to merge bam files"""
|
|
21
|
-
|
|
21
|
+
logger.info("Using samtools ...")
|
|
22
22
|
ofile = (
|
|
23
23
|
outfile
|
|
24
24
|
if not should_sort
|
|
@@ -43,11 +43,11 @@ def use_samtools():
|
|
|
43
43
|
*merge_args,
|
|
44
44
|
*bamfiles,
|
|
45
45
|
]
|
|
46
|
-
|
|
46
|
+
logger.info("- Merging the bam files ...")
|
|
47
47
|
run_command(cmd)
|
|
48
48
|
|
|
49
49
|
if should_sort:
|
|
50
|
-
|
|
50
|
+
logger.info("- Sorting the merged bam file ...")
|
|
51
51
|
for key in ["-o", "-@", "--threads"]:
|
|
52
52
|
if key in sort_args:
|
|
53
53
|
raise ValueError(
|
|
@@ -67,16 +67,14 @@ def use_samtools():
|
|
|
67
67
|
run_command(cmd)
|
|
68
68
|
|
|
69
69
|
if should_index:
|
|
70
|
-
|
|
70
|
+
logger.info("- Indexing the output bam file ...")
|
|
71
71
|
cmd = [samtools, "index", "-@", ncores, outfile]
|
|
72
72
|
run_command(cmd)
|
|
73
73
|
|
|
74
|
-
print("Done")
|
|
75
|
-
|
|
76
74
|
|
|
77
75
|
def use_sambamba():
|
|
78
76
|
"""Use sambamba to merge bam files"""
|
|
79
|
-
|
|
77
|
+
logger.info("Using sambamba ...")
|
|
80
78
|
ofile = (
|
|
81
79
|
outfile
|
|
82
80
|
if not should_sort
|
|
@@ -90,11 +88,11 @@ def use_sambamba():
|
|
|
90
88
|
)
|
|
91
89
|
|
|
92
90
|
cmd = [sambamba, "merge", "-t", ncores, *merge_args, ofile, *bamfiles]
|
|
93
|
-
|
|
91
|
+
logger.info("- Merging the bam files ...")
|
|
94
92
|
run_command(cmd)
|
|
95
93
|
|
|
96
94
|
if should_sort:
|
|
97
|
-
|
|
95
|
+
logger.info("- Sorting the merged bam file ...")
|
|
98
96
|
for key in ["-t", "--nthreads", "-o", "--out"]:
|
|
99
97
|
if key in sort_args:
|
|
100
98
|
raise ValueError(
|
|
@@ -115,12 +113,10 @@ def use_sambamba():
|
|
|
115
113
|
run_command(cmd)
|
|
116
114
|
|
|
117
115
|
if should_index:
|
|
118
|
-
|
|
116
|
+
logger.info("- Indexing the output bam file ...")
|
|
119
117
|
cmd = [sambamba, "index", "-t", ncores, outfile]
|
|
120
118
|
run_command(cmd)
|
|
121
119
|
|
|
122
|
-
print("Done")
|
|
123
|
-
|
|
124
120
|
|
|
125
121
|
if __name__ == "__main__":
|
|
126
122
|
if tool == "samtools":
|
|
biopipen/scripts/bam/BamSampling.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from biopipen.utils.misc import run_command, logger
|
|
3
|
+
|
|
4
|
+
# using:
|
|
5
|
+
# samtools view --subsample 0.1 --subsample-seed 1234 --threads 4 -b -o out.bam in.bam
|
|
6
|
+
|
|
7
|
+
bamfile = {{ in.bamfile | repr }} # pyright: ignore # noqa
|
|
8
|
+
outfile = Path({{ out.outfile | repr }}) # pyright: ignore
|
|
9
|
+
ncores = {{ envs.ncores | int }} # pyright: ignore
|
|
10
|
+
samtools = {{ envs.samtools | repr }} # pyright: ignore
|
|
11
|
+
tool = {{ envs.tool | repr }} # pyright: ignore
|
|
12
|
+
fraction = {{ envs.fraction | repr }} # pyright: ignore
|
|
13
|
+
seed = {{ envs.seed | int }} # pyright: ignore
|
|
14
|
+
should_index = {{ envs.index | repr }} # pyright: ignore
|
|
15
|
+
should_sort = {{ envs.sort | repr }} # pyright: ignore
|
|
16
|
+
sort_args = {{ envs.sort_args | repr }} # pyright: ignore
|
|
17
|
+
|
|
18
|
+
if should_index and not should_sort:
|
|
19
|
+
raise ValueError("Indexing requires sorting")
|
|
20
|
+
|
|
21
|
+
if fraction is None:
|
|
22
|
+
raise ValueError("'envs.fraction' must be provided.")
|
|
23
|
+
|
|
24
|
+
if tool != "samtools":
|
|
25
|
+
raise ValueError(
|
|
26
|
+
f"Tool {tool} is not supported. "
|
|
27
|
+
"Currently only samtools is supported."
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
if fraction > 1:
|
|
31
|
+
# calculate the fraction based on the number of reads
|
|
32
|
+
logger.info("Converting fraction > 1 to a fraction of reads.")
|
|
33
|
+
cmd = [
|
|
34
|
+
samtools,
|
|
35
|
+
"view",
|
|
36
|
+
"--threads",
|
|
37
|
+
ncores,
|
|
38
|
+
"-c",
|
|
39
|
+
bamfile
|
|
40
|
+
]
|
|
41
|
+
nreads = run_command(cmd, stdout="return").strip()
|
|
42
|
+
fraction = fraction / float(int(nreads))
|
|
43
|
+
|
|
44
|
+
ofile = (
|
|
45
|
+
outfile
|
|
46
|
+
if not should_sort
|
|
47
|
+
else outfile.with_stem(f"{outfile.stem}.unsorted")
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
cmd = [
|
|
51
|
+
samtools,
|
|
52
|
+
"view",
|
|
53
|
+
"--subsample",
|
|
54
|
+
fraction,
|
|
55
|
+
"--subsample-seed",
|
|
56
|
+
seed,
|
|
57
|
+
"--threads",
|
|
58
|
+
ncores,
|
|
59
|
+
"-b",
|
|
60
|
+
"-o",
|
|
61
|
+
ofile,
|
|
62
|
+
bamfile
|
|
63
|
+
]
|
|
64
|
+
run_command(cmd, fg=True)
|
|
65
|
+
|
|
66
|
+
if should_sort:
|
|
67
|
+
logger.info("Sorting the output bam file.")
|
|
68
|
+
for key in ["-o", "-@", "--threads"]:
|
|
69
|
+
if key in sort_args:
|
|
70
|
+
raise ValueError(
|
|
71
|
+
f"envs.sort_args cannot contain {key}, "
|
|
72
|
+
"which is managed by the script"
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
cmd = [
|
|
76
|
+
samtools,
|
|
77
|
+
"sort",
|
|
78
|
+
"-@",
|
|
79
|
+
ncores,
|
|
80
|
+
*sort_args,
|
|
81
|
+
"-o",
|
|
82
|
+
outfile,
|
|
83
|
+
ofile
|
|
84
|
+
]
|
|
85
|
+
run_command(cmd, fg=True)
|
|
86
|
+
|
|
87
|
+
if should_index:
|
|
88
|
+
logger.info("Indexing the output bam file.")
|
|
89
|
+
cmd = [samtools, "index", "-@", ncores, outfile]
|
|
90
|
+
run_command(cmd, fg=True)
|
|
biopipen/scripts/protein/Prodigy.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from prodigy_prot.predict_IC import (
|
|
6
|
+
Prodigy,
|
|
7
|
+
check_path,
|
|
8
|
+
parse_structure,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
infile = {{in.infile | repr}} # pyright: ignore # noqa
|
|
12
|
+
outfile = {{out.outfile | repr}} # pyright: ignore
|
|
13
|
+
outdir = {{out.outdir | repr}} # pyright: ignore
|
|
14
|
+
distance_cutoff = {{envs.distance_cutoff | float}} # pyright: ignore
|
|
15
|
+
acc_threshold = {{envs.acc_threshold | float}} # pyright: ignore
|
|
16
|
+
temperature = {{envs.temperature | float}} # pyright: ignore
|
|
17
|
+
contact_list = {{envs.contact_list | repr}} # pyright: ignore
|
|
18
|
+
pymol_selection = {{envs.pymol_selection | repr}} # pyright: ignore
|
|
19
|
+
selection = {{envs.selection | repr}} # pyright: ignore
|
|
20
|
+
outtype = {{envs.outtype | repr}} # pyright: ignore
|
|
21
|
+
|
|
22
|
+
raw_outfile = Path(outdir) / "_prodigy_raw.txt"
|
|
23
|
+
json_outfile = Path(outdir) / "_prodigy.json"
|
|
24
|
+
tsv_outfile = Path(outdir) / "_prodigy.tsv"
|
|
25
|
+
|
|
26
|
+
# log to the raw_outfile
|
|
27
|
+
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format="%(message)s")
|
|
28
|
+
logger = logging.getLogger("Prodigy")
|
|
29
|
+
|
|
30
|
+
if isinstance(selection, str):
|
|
31
|
+
selection = [selection]
|
|
32
|
+
|
|
33
|
+
struct_path = check_path(infile)
|
|
34
|
+
|
|
35
|
+
# parse structure
|
|
36
|
+
structure, n_chains, n_res = parse_structure(struct_path)
|
|
37
|
+
logger.info(
|
|
38
|
+
"[+] Parsed structure file {0} ({1} chains, {2} residues)".format(
|
|
39
|
+
structure.id, n_chains, n_res
|
|
40
|
+
)
|
|
41
|
+
)
|
|
42
|
+
prodigy = Prodigy(structure, selection, temperature)
|
|
43
|
+
prodigy.predict(distance_cutoff=distance_cutoff, acc_threshold=acc_threshold)
|
|
44
|
+
prodigy.print_prediction(outfile=raw_outfile, quiet=False)
|
|
45
|
+
|
|
46
|
+
# Print out interaction network
|
|
47
|
+
if contact_list:
|
|
48
|
+
prodigy.print_contacts(f"{outdir}/prodigy.ic")
|
|
49
|
+
|
|
50
|
+
# Print out interaction network
|
|
51
|
+
if pymol_selection:
|
|
52
|
+
prodigy.print_pymol_script(f"{outdir}/prodigy.pml")
|
|
53
|
+
|
|
54
|
+
# [+] Reading structure file: <path/to/structure.cif>
|
|
55
|
+
# [+] Parsed structure file <structure> (4 chains, 411 residues)
|
|
56
|
+
# [+] No. of intermolecular contacts: 191
|
|
57
|
+
# [+] No. of charged-charged contacts: 17
|
|
58
|
+
# [+] No. of charged-polar contacts: 18
|
|
59
|
+
# [+] No. of charged-apolar contacts: 60
|
|
60
|
+
# [+] No. of polar-polar contacts: 5
|
|
61
|
+
# [+] No. of apolar-polar contacts: 41
|
|
62
|
+
# [+] No. of apolar-apolar contacts: 50
|
|
63
|
+
# [+] Percentage of apolar NIS residues: 33.90
|
|
64
|
+
# [+] Percentage of charged NIS residues: 30.48
|
|
65
|
+
# [++] Predicted binding affinity (kcal.mol-1): -21.3
|
|
66
|
+
# [++] Predicted dissociation constant (M) at 25.0˚C: 2.3e-16
|
|
67
|
+
|
|
68
|
+
output = {}
|
|
69
|
+
with open(raw_outfile, "r") as f:
|
|
70
|
+
for line in f:
|
|
71
|
+
if line.startswith("[+"):
|
|
72
|
+
line = line.lstrip("[").lstrip("+").lstrip("]").lstrip()
|
|
73
|
+
if line.startswith("Reading structure file"):
|
|
74
|
+
continue
|
|
75
|
+
if line.startswith("Parsed structure file"):
|
|
76
|
+
continue
|
|
77
|
+
|
|
78
|
+
key, value = line.split(":", 1)
|
|
79
|
+
key = key.strip()
|
|
80
|
+
value = value.strip()
|
|
81
|
+
if key == "No. of intermolecular contacts":
|
|
82
|
+
output["nIC"] = int(value)
|
|
83
|
+
elif key == "No. of charged-charged contacts":
|
|
84
|
+
output["nCCC"] = int(value)
|
|
85
|
+
elif key == "No. of charged-polar contacts":
|
|
86
|
+
output["nCPC"] = int(value)
|
|
87
|
+
elif key == "No. of charged-apolar contacts":
|
|
88
|
+
output["nCAPC"] = int(value)
|
|
89
|
+
elif key == "No. of polar-polar contacts":
|
|
90
|
+
output["nPPC"] = int(value)
|
|
91
|
+
elif key == "No. of apolar-polar contacts":
|
|
92
|
+
output["nAPPC"] = int(value)
|
|
93
|
+
elif key == "No. of apolar-apolar contacts":
|
|
94
|
+
output["nAPAPC"] = int(value)
|
|
95
|
+
elif key.startswith("Percentage of apolar NIS residues"):
|
|
96
|
+
output["pANISR"] = float(value)
|
|
97
|
+
elif key.startswith("Percentage of charged NIS residues"):
|
|
98
|
+
output["pCNISR"] = float(value)
|
|
99
|
+
elif key.startswith("Predicted binding affinity"):
|
|
100
|
+
output["BindingAffinity"] = float(value)
|
|
101
|
+
elif key.startswith("Predicted dissociation constant"):
|
|
102
|
+
output["DissociationConstant"] = float(value)
|
|
103
|
+
|
|
104
|
+
with open(json_outfile, "w") as f:
|
|
105
|
+
json.dump(output, f, indent=2)
|
|
106
|
+
|
|
107
|
+
with open(tsv_outfile, "w") as f:
|
|
108
|
+
f.write("\t".join(output.keys()) + "\n")
|
|
109
|
+
f.write("\t".join(map(str, output.values())) + "\n")
|
|
110
|
+
|
|
111
|
+
if outtype == "json":
|
|
112
|
+
json_outfile.rename(outfile)
|
|
113
|
+
json_outfile.symlink_to(outfile)
|
|
114
|
+
elif outtype == "tsv":
|
|
115
|
+
tsv_outfile.rename(outfile)
|
|
116
|
+
tsv_outfile.symlink_to(outfile)
|
|
117
|
+
else:
|
|
118
|
+
raw_outfile.rename(outfile)
|
|
119
|
+
raw_outfile.symlink_to(outfile)
|