mgnify-pipelines-toolkit 1.0.3.tar.gz → 1.0.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of mgnify-pipelines-toolkit might be problematic.
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/PKG-INFO +19 -27
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/README.md +2 -1
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +5 -1
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +84 -21
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +11 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +25 -7
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +13 -9
- mgnify_pipelines_toolkit-1.0.5/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +211 -0
- mgnify_pipelines_toolkit-1.0.5/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +162 -0
- mgnify_pipelines_toolkit-1.0.5/mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +230 -0
- mgnify_pipelines_toolkit-1.0.5/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +119 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +6 -3
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +1 -1
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +19 -27
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +4 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +5 -0
- mgnify_pipelines_toolkit-1.0.5/mgnify_pipelines_toolkit.egg-info/requires.txt +20 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/pyproject.toml +22 -26
- mgnify_pipelines_toolkit-1.0.3/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -29
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/genomes/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/ncrna.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/thresholds.py +1 -1
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/schemas/schemas.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
- {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/setup.cfg +0 -0
{mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.0.3
+Version: 1.0.5
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
@@ -11,33 +11,24 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: biopython
-Requires-Dist: numpy
-Requires-Dist: pandas
-Requires-Dist: regex
-Requires-Dist: requests
-Requires-Dist: click
-Requires-Dist: pandera
-Requires-Dist: pyfastx
-Requires-Dist: intervaltree
+Requires-Dist: biopython>=1.85
+Requires-Dist: numpy<3,>=2.2.4
+Requires-Dist: pandas<3,>=2.2.3
+Requires-Dist: regex>=2024.11.6
+Requires-Dist: requests<3,>=2.32.3
+Requires-Dist: click<9,>=8.1.8
+Requires-Dist: pandera<0.24,>=0.23.1
+Requires-Dist: pyfastx<3,>=2.2.0
+Requires-Dist: intervaltree<4,>=3.1.0
 Provides-Extra: tests
-Requires-Dist: pytest
-Requires-Dist: pytest-md
-Requires-Dist: pytest-workflow==2.0
-Requires-Dist: biopython==1.82; extra == "tests"
-Requires-Dist: pandas==2.0.2; extra == "tests"
-Requires-Dist: numpy==1.26.0; extra == "tests"
-Requires-Dist: regex==2023.12.25; extra == "tests"
-Requires-Dist: requests==2.32.3; extra == "tests"
-Requires-Dist: click==8.1.7; extra == "tests"
-Requires-Dist: pandera==0.22.1; extra == "tests"
-Requires-Dist: pyfastx>=2.2.0; extra == "tests"
+Requires-Dist: pytest<9,>=8.3.5; extra == "tests"
+Requires-Dist: pytest-md>=0.2.0; extra == "tests"
+Requires-Dist: pytest-workflow==2.1.0; extra == "tests"
 Provides-Extra: dev
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: pep8-naming==0.14.1; extra == "dev"
+Requires-Dist: pre-commit>=4.2.0; extra == "dev"
+Requires-Dist: black>=25.1.0; extra == "dev"
+Requires-Dist: flake8>=7.1.2; extra == "dev"
+Requires-Dist: pep8-naming>=0.14.1; extra == "dev"
 Dynamic: license-file
 
 # mgnify-pipelines-toolkit
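The release swaps unpinned runtime requirements for bounded PEP 508 ranges (a lower bound plus a major-version cap, e.g. `numpy<3,>=2.2.4`), and the test/dev extras move from exact pins to the same style. A quick way to sanity-check what such a specifier admits, using the third-party `packaging` library (the versions tested below are illustrative):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=2.2.4,<3")  # the new numpy constraint

assert Version("2.2.4") in spec   # lower bound is inclusive
assert Version("2.9.0") in spec   # any later 2.x is accepted
assert Version("2.2.3") not in spec
assert Version("3.0.0") not in spec  # the next major is excluded
```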
@@ -74,8 +65,9 @@ Before starting any development, you should do these few steps:
 - Clone the repo if you haven't already and create a feature branch from the `dev` branch (NOT `main`).
 - Create a virtual environment with the tool of your choice (i.e. `conda create --name my_new_env`)
 - Activate you new environment (i.e. `conda activate my_new_env`)
-- Install dev dependencies `pip install -e '.[dev]'`
+- Install dev dependencies `pip install -e '.[tests,dev]'`
 - Install pre-commit hooks `pre-commit install`
+- Run unit tests `pytest`
 
 When doing these steps above, you ensure that the code you add will be linted and formatted properly.
{mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/README.md

@@ -32,8 +32,9 @@ Before starting any development, you should do these few steps:
 - Clone the repo if you haven't already and create a feature branch from the `dev` branch (NOT `main`).
 - Create a virtual environment with the tool of your choice (i.e. `conda create --name my_new_env`)
 - Activate you new environment (i.e. `conda activate my_new_env`)
-- Install dev dependencies `pip install -e '.[dev]'`
+- Install dev dependencies `pip install -e '.[tests,dev]'`
 - Install pre-commit hooks `pre-commit install`
+- Run unit tests `pytest`
 
 When doing these steps above, you ensure that the code you add will be linted and formatted properly.
{mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py

@@ -78,7 +78,11 @@ def main():
         "--output",
         required=True,
         type=Path,
-        help=
+        help=(
+            "Output TSV file with columns: contig_id, protein_id, protein hash, "
+            "Rhea IDs, CHEBI reaction, reaction definition, 'top hit' if it is "
+            "the first hit for the protein"
+        ),
     )
     parser.add_argument(
         "-p",
{mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py

@@ -17,8 +17,19 @@
 
 import re
 import sys
+import fileinput
 
-from mgnify_pipelines_toolkit.constants.thresholds import
+from mgnify_pipelines_toolkit.constants.thresholds import (
+    EVALUE_CUTOFF_IPS,
+    EVALUE_CUTOFF_EGGNOG,
+)
+
+DBCAN_CLASSES_DICT = {
+    "TC": "dbcan_transporter_classification",
+    "TF": "dbcan_transcription_factor",
+    "STP": "dbcan_signal_transduction_prot",
+    "CAZyme": "dbcan_prot_family",
+}
 
 
 def get_iprs(ipr_annot):
@@ -26,7 +37,8 @@ def get_iprs(ipr_annot):
     antifams = list()
     if not ipr_annot:
         return iprs, antifams
-    with
+    with fileinput.hook_compressed(ipr_annot, "r", encoding="utf-8") as f:
+
         for line in f:
             cols = line.strip().split("\t")
             protein = cols[0]
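The change repeated throughout this module replaces plain `open()` calls with `fileinput.hook_compressed`, which dispatches on the file extension (`.gz` to `gzip.open`, `.bz2` to `bz2.open`, anything else to a regular `open()`), so every parser now accepts both plain and gzipped inputs. A minimal sketch of the pattern; the file name is illustrative, and the `encoding` keyword requires Python 3.10+:

```python
import fileinput

def count_data_lines(path):
    # hook_compressed picks the opener from the extension, so the same
    # loop handles "annotations.tsv" and "annotations.tsv.gz" transparently.
    count = 0
    with fileinput.hook_compressed(path, "r", encoding="utf-8") as handle:
        for line in handle:
            if not line.startswith("#"):  # skip comment/header lines
                count += 1
    return count
```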
@@ -55,7 +67,8 @@ def get_eggnog(eggnog_annot):
     eggnogs = {}
     if not eggnog_annot:
         return eggnogs
-    with
+    with fileinput.hook_compressed(eggnog_annot, "r", encoding="utf-8") as f:
+
         for line in f:
             line = line.rstrip()
             cols = line.split("\t")
@@ -104,7 +117,8 @@ def get_bgcs(bgc_file, prokka_gff, tool):
         return bgc_annotations
     # save positions of each BGC cluster to dictionary cluster_positions
     # and save the annotations to dictionary bgc_result
-    with
+    with fileinput.hook_compressed(bgc_file, "r", encoding="utf-8") as bgc_in:
+
         for line in bgc_in:
             if not line.startswith("#"):
                 (
@@ -138,7 +152,7 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                 type_value = ""
                 as_product = ""
                 for a in annotations.split(
-
+                    ";"
                 ):  # go through all parts of the annotation field
                     if a.startswith("as_type="):
                         type_value = a.split("=")[1]
@@ -170,9 +184,12 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                         {"bgc_function": type_value},
                     )
                     if as_product:
-                        tool_result[contig]["_".join([start_pos, end_pos])][
+                        tool_result[contig]["_".join([start_pos, end_pos])][
+                            "bgc_product"
+                        ] = as_product
     # identify CDSs that fall into each of the clusters annotated by the BGC tool
-    with
+    with fileinput.hook_compressed(prokka_gff, "r", encoding="utf-8") as gff_in:
+
         for line in gff_in:
             if not line.startswith("#"):
                 matching_interval = ""
@@ -228,8 +245,9 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                     },
                 )
                 if "bgc_product" in tool_result[contig][matching_interval]:
-                    bgc_annotations[cds_id]["antismash_product"] = tool_result[
-
+                    bgc_annotations[cds_id]["antismash_product"] = tool_result[
+                        contig
+                    ][matching_interval]["bgc_product"]
             elif line.startswith("##FASTA"):
                 break
     return bgc_annotations
@@ -239,7 +257,7 @@ def get_amr(amr_file):
     amr_annotations = {}
     if not amr_file:
         return amr_annotations
-    with
+    with fileinput.hook_compressed(amr_file, "r", encoding="utf-8") as f:
         for line in f:
             if line.startswith("Protein identifier"):
                 continue
@@ -286,7 +304,7 @@ def get_dbcan(dbcan_file):
     substrates = dict()
     if not dbcan_file:
         return dbcan_annotations
-    with
+    with fileinput.hook_compressed(dbcan_file, "r", encoding="utf-8") as f:
         for line in f:
             if "predicted PUL" in line:
                 annot_fields = line.strip().split("\t")[8].split(";")
@@ -314,13 +332,45 @@ def get_dbcan(dbcan_file):
                 elif a.startswith("Parent"):
                     parent = a.split("=")[1]
                 dbcan_annotations[acc] = (
-                    "dbcan_prot_type={};
+                    "dbcan_prot_type={};{}={};substrate_dbcan-pul={};substrate_dbcan-sub={}".format(
                         prot_type,
+                        DBCAN_CLASSES_DICT[prot_type],
                         prot_fam,
                         substrates[parent]["substrate_pul"],
                         substrates[parent]["substrate_ecami"],
                     )
                 )
+
+    return dbcan_annotations
+
+
+def get_dbcan_individual_cazys(dbcan_cazys_file):
+    dbcan_annotations = dict()
+    if not dbcan_cazys_file:
+        return dbcan_annotations
+    with fileinput.hook_compressed(dbcan_cazys_file, "r", encoding="utf-8") as f:
+        for line in f:
+            if line.startswith("#"):
+                continue
+            attributes = line.strip().split("\t")[8]
+            attributes_dict = dict(
+                re.split(r"(?<!\\)=", item)
+                for item in re.split(r"(?<!\\);", attributes.rstrip(";"))
+            )
+            if "num_tools" in attributes_dict and int(attributes_dict["num_tools"]) < 2:
+                continue  # don't keep annotations supported by only one tool within dbcan
+            cds_pattern = r"\.CDS\d+$"
+            protein = re.sub(
+                cds_pattern, "", attributes_dict["ID"]
+            )  # remove the CDS number
+            annotation_text = "dbcan_prot_type=CAZyme;"
+            for field in ["protein_family", "substrate_dbcan-sub", "eC_number"]:
+                if field in attributes_dict:
+                    annotation_text += (
+                        f"{'dbcan_prot_family' if field == 'protein_family' else field}"
+                        f"={attributes_dict[field]};"
+                    )
+            dbcan_annotations[protein] = annotation_text.strip(";")
     return dbcan_annotations
 
 
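The new `get_dbcan_individual_cazys` function turns a GFF column-9 attribute string into a dict by splitting on `;` and `=` only where they are not backslash-escaped, via the negative lookbehind `(?<!\\)`, then strips the trailing `.CDS<n>` suffix to recover the protein accession. A standalone sketch of that parsing (the attribute string below is made up):

```python
import re

# A made-up GFF attributes field; the product value contains an escaped ';'.
attributes = r"ID=contig1_42.CDS1;protein_family=GH13;product=alpha\;beta hydrolase;num_tools=3"

# Split on ';' and '=' only when not preceded by a backslash, so escaped
# separators inside values are preserved.
attributes_dict = dict(
    re.split(r"(?<!\\)=", item)
    for item in re.split(r"(?<!\\);", attributes.rstrip(";"))
)
assert attributes_dict["product"] == r"alpha\;beta hydrolase"

# Drop the trailing '.CDS<n>' to get back to the protein accession.
protein = re.sub(r"\.CDS\d+$", "", attributes_dict["ID"])
assert protein == "contig1_42"
```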
@@ -329,7 +379,8 @@ def get_defense_finder(df_file):
     type_info = dict()
     if not df_file:
         return defense_finder_annotations
-    with
+    with fileinput.hook_compressed(df_file, "r", encoding="utf-8") as f:
+
         for line in f:
             if "Anti-phage system" in line:
                 annot_fields = line.strip().split("\t")[8].split(";")
@@ -366,6 +417,7 @@ def load_annotations(
     antismash_file,
     gecco_file,
     dbcan_file,
+    dbcan_cazys_file,
     defense_finder_file,
     pseudofinder_file,
 ):
@@ -376,6 +428,7 @@ def load_annotations(
     antismash_bgcs = get_bgcs(antismash_file, in_gff, tool="antismash")
     amr_annotations = get_amr(amr_file)
     dbcan_annotations = get_dbcan(dbcan_file)
+    dbcan_cazys_annotations = get_dbcan_individual_cazys(dbcan_cazys_file)
     defense_finder_annotations = get_defense_finder(defense_finder_file)
     pseudogenes = get_pseudogenes(pseudofinder_file)
     pseudogene_report_dict = dict()
@@ -384,7 +437,7 @@ def load_annotations(
     header = []
     fasta = []
     fasta_flag = False
-    with
+    with fileinput.hook_compressed(in_gff, "r", encoding="utf-8") as f:
         for line in f:
             line = line.strip()
             if line[0] != "#" and not fasta_flag:
@@ -496,6 +549,11 @@ def load_annotations(
                 added_annot[protein]["dbCAN"] = dbcan_annotations[protein]
             except KeyError:
                 pass
+            try:
+                dbcan_cazys_annotations[protein]
+                added_annot[protein]["dbCAN"] = dbcan_cazys_annotations[protein]
+            except KeyError:
+                pass
             try:
                 defense_finder_annotations[protein]
                 added_annot[protein]["defense_finder"] = (
@@ -530,7 +588,7 @@
 def get_ncrnas(ncrnas_file):
     ncrnas = {}
     counts = 0
-    with
+    with fileinput.hook_compressed(ncrnas_file, "r", encoding="utf-8") as f:
         for line in f:
             if not line.startswith("#"):
                 cols = line.strip().split()
@@ -543,7 +601,9 @@ def get_ncrnas(ncrnas_file):
                 # Skip tRNAs, we add them from tRNAscan-SE
                 continue
             strand = cols[11]
-            start, end =
+            start, end = int(cols[10]), int(cols[9])
+            if strand == "+":
+                start, end = end, start
             rna_feature_name, ncrna_class = prepare_rna_gff_fields(cols)
             annot = [
                 "ID=" + locus,
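In Infernal tblout output the sequence coordinates follow the hit's orientation, so `seq from` is greater than `seq to` for minus-strand hits; the added lines normalise the pair so that `start <= end`, as GFF requires. A standalone sketch of the same logic (the function name and values are invented):

```python
def normalise_coords(seq_from, seq_to, strand):
    """Return (start, end) with start <= end, as GFF requires.

    On the minus strand Infernal reports seq_from > seq_to, so the raw
    pair has to be flipped depending on orientation.
    """
    start, end = seq_to, seq_from
    if strand == "+":
        start, end = end, start
    return start, end

assert normalise_coords(100, 250, "+") == (100, 250)
assert normalise_coords(250, 100, "-") == (100, 250)
```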
@@ -718,7 +778,10 @@ def prepare_rna_gff_fields(cols):
     }
 
     if rna_feature_name == "ncRNA":
-        ncrna_class = next(
+        ncrna_class = next(
+            (rna_type for rna_type, rfams in rna_types.items() if cols[2] in rfams),
+            None,
+        )
         if not ncrna_class:
             if "microRNA" in cols[-1]:
                 ncrna_class = "pre_miRNA"
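The rebuilt lookup uses `next()` with a default argument: the generator expression yields the first RNA class whose Rfam accession set contains the query, and `None` is returned instead of raising `StopIteration` when nothing matches. A toy version (the mapping below is a stand-in, not the script's full table):

```python
# A stand-in mapping from RNA class to the Rfam models it covers.
rna_types = {
    "SRP_RNA": {"RF00169", "RF01854"},
    "RNase_P_RNA": {"RF00010", "RF00011"},
}

def classify(model):
    # next(generator, default) returns the first match, or the default
    # instead of raising StopIteration when the generator is exhausted.
    return next(
        (rna_type for rna_type, rfams in rna_types.items() if model in rfams),
        None,
    )

assert classify("RF00010") == "RNase_P_RNA"
assert classify("RF99999") is None
```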
@@ -729,7 +792,7 @@ def prepare_rna_gff_fields(cols):
 
 def get_trnas(trnas_file):
     trnas = {}
-    with
+    with fileinput.hook_compressed(trnas_file, "r", encoding="utf-8") as f:
         for line in f:
             if not line.startswith("#"):
                 cols = line.split("\t")
@@ -738,13 +801,13 @@ def get_trnas(trnas_file):
                 line = line.replace("tRNAscan-SE", "tRNAscan-SE:2.0.9")
                 trnas.setdefault(contig, dict()).setdefault(
                     int(start), list()
-                ).append(line.strip())
+                ).append(line.strip().strip(";"))
     return trnas
 
 
 def load_crispr(crispr_file):
     crispr_annotations = dict()
-    with
+    with fileinput.hook_compressed(crispr_file, "r", encoding="utf-8") as f:
         record = list()
         left_coord = ""
         loc_contig = ""
@@ -791,7 +854,7 @@ def get_pseudogenes(pseudofinder_file):
     pseudogenes = dict()
     if not pseudofinder_file:
         return pseudogenes
-    with
+    with fileinput.hook_compressed(pseudofinder_file, "r", encoding="utf-8") as file_in:
         for line in file_in:
             if not line.startswith("#"):
                 col9 = line.strip().split("\t")[8]
{mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py

@@ -28,6 +28,17 @@ def write_results_to_file(
     contig_list = check_for_additional_keys(
         ncrnas, trnas, crispr_annotations, contig_list
     )
+    # sort contigs by digit at the end of contig/genome accession
+    if contig_list[0].startswith(
+        "MGYG"
+    ):  # e.g. 'MGYG000500002_1', 'MGYG000500002_2', 'MGYG000500002_3'
+        contig_list = sorted(list(contig_list), key=lambda x: int(x.split("_")[-1]))
+    elif contig_list[0].startswith(
+        "ERZ"
+    ):  # e.g. 'ERZ1049444', 'ERZ1049445', 'ERZ1049446'
+        contig_list = sorted(
+            list(contig_list), key=lambda x: int(x.split("ERZ")[-1])
+        )
     for contig in contig_list:
         sorted_pos_list = sort_positions(
             contig, main_gff_extended, ncrnas, trnas, crispr_annotations
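The new sorting block exists because lexicographic order misplaces numbered contigs ('MGYG000500002_10' sorts before 'MGYG000500002_2'); sorting on the integer suffix restores natural order. A minimal illustration with invented accessions:

```python
contigs = ["MGYG000500002_10", "MGYG000500002_2", "MGYG000500002_1"]

# Lexicographic order is wrong for numbered contigs ...
assert sorted(contigs)[1] == "MGYG000500002_10"

# ... so sort on the integer after the final underscore instead.
by_suffix = sorted(contigs, key=lambda x: int(x.split("_")[-1]))
assert by_suffix == ["MGYG000500002_1", "MGYG000500002_2", "MGYG000500002_10"]
```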
{mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py

@@ -17,8 +17,16 @@
 
 import argparse
 
-from gff_annotation_utils import
-
+from mgnify_pipelines_toolkit.analysis.assembly.gff_annotation_utils import (
+    get_ncrnas,
+    get_trnas,
+    load_annotations,
+    load_crispr,
+)
+from mgnify_pipelines_toolkit.analysis.assembly.gff_file_utils import (
+    write_results_to_file,
+    print_pseudogene_report,
+)
 
 
 def main(
@@ -31,6 +39,7 @@ def main(
    antismash_file,
    gecco_file,
    dbcan_file,
+    dbcan_cazys_file,
    defense_finder_file,
    pseudofinder_file,
    rfam_file,
@@ -53,6 +62,7 @@ def main(
         antismash_file,
         gecco_file,
         dbcan_file,
+        dbcan_cazys_file,
         defense_finder_file,
         pseudofinder_file,
     )
@@ -66,7 +76,9 @@ def main(
     if crispr_file:
         crispr_annotations = load_crispr(crispr_file)
 
-    write_results_to_file(
+    write_results_to_file(
+        outfile, header, main_gff_extended, fasta, ncrnas, trnas, crispr_annotations
+    )
     if pseudogene_report_file:
         print_pseudogene_report(pseudogene_report_dict, pseudogene_report_file)
 
@@ -74,7 +86,7 @@
 def parse_args():
     parser = argparse.ArgumentParser(
         description="The script extends a user-provided base GFF annotation file by incorporating "
-
+        "information extracted from the user-provided outputs of supplementary annotation tools.",
     )
     parser.add_argument(
         "-g",
@@ -124,7 +136,12 @@ def parse_args():
     )
     parser.add_argument(
         "--dbcan",
-        help="The GFF file produced by dbCAN post-processing script",
+        help="The GFF file produced by dbCAN post-processing script that uses cluster annotations",
+        required=False,
+    )
+    parser.add_argument(
+        "--dbcan-cazys",
+        help="The GFF file produced by dbCAN-CAZYs post-processing script",
         required=False,
     )
     parser.add_argument(
@@ -149,7 +166,7 @@ def parse_args():
     return parser.parse_args()
 
 
-if __name__ ==
+if __name__ == "__main__":
     args = parse_args()
     main(
         args.gff_input,
@@ -161,10 +178,11 @@ if __name__ == "__main__":
         args.antismash,
         args.gecco,
         args.dbcan,
+        args.dbcan_cazys,
         args.defense_finder,
         args.pseudofinder,
         args.rfam,
         args.trnascan,
         args.outfile,
         args.pseudogene_report,
-
+    )
{mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py

@@ -40,10 +40,12 @@ def import_nodes(nodes_dmp):
     taxid2rank = {}
 
     with open(nodes_dmp) as f1:
-
-
-
-
+        for line in f1:
+            fields = [part.strip() for part in line.split("|")]
+            if len(fields) != 14:
+                raise ValueError(f"Unexpected number of columns in line: {line}")
+            taxid = fields[0]
+            rank = fields[2]
             taxid2rank[taxid] = rank
 
     return taxid2rank
@@ -54,11 +56,13 @@ def import_names(names_dmp):
     taxid2name = {}
 
     with open(names_dmp, newline="") as f1:
-
-
-        if
-
-
+        for line in f1:
+            fields = [part.strip() for part in line.split("|")]
+            if len(fields) != 5:
+                raise ValueError(f"Unexpected number of columns in line: {line}")
+            if fields[3] == "scientific name":
+                taxid = fields[0]
+                name = fields[1]
                 taxid2name[taxid] = name
 
     return taxid2name
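Both rewritten loops parse NCBI-taxonomy-style `.dmp` dumps, where columns are separated by `\t|\t` and each line ends with `\t|`; splitting on `|` and stripping therefore yields the columns plus one trailing empty string, hence the expected field counts of 14 (nodes.dmp) and 5 (names.dmp). A self-contained sketch on a fabricated names.dmp line:

```python
# One line in the names.dmp format: tab-pipe-tab separated, trailing tab-pipe.
line = "562\t|\tEscherichia coli\t|\t\t|\tscientific name\t|\n"

fields = [part.strip() for part in line.split("|")]
assert len(fields) == 5  # 4 columns + the empty string after the final '|'

taxid, name, unique_name, name_class = fields[0], fields[1], fields[2], fields[3]
if name_class == "scientific name":
    print(taxid, name)  # 562 Escherichia coli
```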