mgnify-pipelines-toolkit 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +30 -37
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +3 -3
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +3 -1
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +1 -1
- mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +41 -38
- mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +829 -0
- mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +82 -0
- mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +170 -0
- mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +243 -0
- mgnify_pipelines_toolkit/constants/thresholds.py +7 -0
- {mgnify_pipelines_toolkit-0.2.0.dist-info → mgnify_pipelines_toolkit-0.2.2.dist-info}/METADATA +3 -1
- {mgnify_pipelines_toolkit-0.2.0.dist-info → mgnify_pipelines_toolkit-0.2.2.dist-info}/RECORD +16 -12
- {mgnify_pipelines_toolkit-0.2.0.dist-info → mgnify_pipelines_toolkit-0.2.2.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.2.0.dist-info → mgnify_pipelines_toolkit-0.2.2.dist-info}/entry_points.txt +2 -1
- {mgnify_pipelines_toolkit-0.2.0.dist-info → mgnify_pipelines_toolkit-0.2.2.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.2.0.dist-info → mgnify_pipelines_toolkit-0.2.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,829 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
# Copyright 2025 EMBL - European Bioinformatics Institute
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the 'License');
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an 'AS IS' BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
import re
|
|
19
|
+
import sys
|
|
20
|
+
|
|
21
|
+
from mgnify_pipelines_toolkit.constants.thresholds import EVALUE_CUTOFF_IPS, EVALUE_CUTOFF_EGGNOG
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_iprs(ipr_annot):
    """Collect Pfam and InterPro accessions per protein from a TSV file.

    The file is read as header-less, tab-separated text (presumably
    InterProScan output - TODO confirm against the producer).  Columns
    used: 0 = protein ID, 3 = analysis/database name, 4 = signature
    accession, 8 = e-value, 11 = InterPro accession (only read when the
    row has more than 12 columns).

    Args:
        ipr_annot: Path to the TSV file.  A falsy value means "no file".

    Returns:
        A tuple ``(iprs, antifams)``.  ``iprs`` maps a protein ID to a
        two-element list ``[pfam_accessions, interpro_accessions]`` (both
        sets).  ``antifams`` is the list of protein IDs that had an
        "AntiFam" hit passing the e-value cutoff.
    """
    iprs = {}
    antifams = []
    if not ipr_annot:
        return iprs, antifams
    with open(ipr_annot) as f:
        for line in f:
            cols = line.strip().split("\t")
            try:
                evalue = float(cols[8])
            except (IndexError, ValueError):
                # IndexError: blank or truncated row; ValueError: the
                # e-value column holds a non-numeric placeholder.
                continue
            if evalue > EVALUE_CUTOFF_IPS:
                continue
            protein, database = cols[0], cols[3]
            if database == "AntiFam":
                # AntiFam hits are reported separately and never get
                # Pfam/InterPro annotations from this row.
                antifams.append(protein)
                continue
            pfams, interpros = iprs.setdefault(protein, [set(), set()])
            if database == "Pfam":
                pfams.add(cols[4])
            if len(cols) > 12 and cols[11] != "-":
                interpros.add(cols[11])
    return iprs, antifams
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def get_eggnog(eggnog_annot):
    """Parse an eggNOG annotation table into per-protein annotation lists.

    Every line starting with ``#`` is handed to ``get_eggnog_fields`` to
    work out the column layout; all other lines are treated as
    tab-separated data rows (column 0 = protein ID, column 2 = e-value).

    Args:
        eggnog_annot: Path to the annotation file.  A falsy value means
            "no file".

    Returns:
        Dict mapping protein ID to ``[eggnog, cog, kegg, go]`` where
        ``eggnog`` is a one-element list holding column 1, ``cog`` is the
        COG column split into single characters (collapsed to ``["R"]``
        when it has more than one), and ``kegg`` / ``go`` are the
        comma-split KEGG_ko and GOs columns.

    Raises:
        SystemExit: If a usable data row is found before any header line,
            or if ``get_eggnog_fields`` rejects the header.
    """
    eggnogs = {}
    if not eggnog_annot:
        return eggnogs
    eggnog_fields = None
    with open(eggnog_annot, "r") as f:
        for line in f:
            line = line.rstrip()
            if line.startswith("#"):
                eggnog_fields = get_eggnog_fields(line)
                continue
            cols = line.split("\t")
            try:
                evalue = float(cols[2])
            except (IndexError, ValueError):
                # blank/truncated row or a non-numeric e-value
                continue
            if evalue > EVALUE_CUTOFF_EGGNOG:
                continue
            if eggnog_fields is None:
                # Without a header the column positions are unknown.
                sys.exit("Cannot parse eggNOG - no header line found before the annotations")

            cog = list(cols[eggnog_fields["cog_func"]])
            if len(cog) > 1:
                # more than one category letter is reported as "R"
                cog = ["R"]

            eggnogs[cols[0]] = [
                [cols[1]],
                cog,
                cols[eggnog_fields["KEGG_ko"]].split(","),
                cols[eggnog_fields["GOs"]].split(","),
            ]
    return eggnogs
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def get_eggnog_fields(line):
    """Work out which columns hold KEGG, COG and GO data from a header line.

    Two column layouts are recognised, identified by where the
    ``KEGG_ko`` and ``CAZy`` column names sit in the tab-separated header.

    Args:
        line: The header line of an eggNOG annotation table.

    Returns:
        Dict with the zero-based column indices under the keys
        ``"KEGG_ko"``, ``"cog_func"`` and ``"GOs"``.

    Raises:
        SystemExit: If there is no ``GOs`` column or the header matches
            neither known layout (including headers that are too short).
    """
    cols = line.strip().split("\t")
    try:
        index_of_go = cols.index("GOs")
    except ValueError:
        sys.exit("Cannot find the GO terms column.")

    # (KEGG_ko index, CAZy index, COG category index) for each known layout
    known_layouts = (
        (8, 15, 20),
        (11, 18, 6),
    )
    for kegg_idx, cazy_idx, cog_idx in known_layouts:
        # The length guard keeps a short header from raising IndexError
        # instead of reaching the diagnostic below.
        if len(cols) > cazy_idx and cols[kegg_idx] == "KEGG_ko" and cols[cazy_idx] == "CAZy":
            return {"KEGG_ko": kegg_idx, "cog_func": cog_idx, "GOs": index_of_go}
    sys.exit("Cannot parse eggNOG - unexpected field order or naming")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _last_attribute_value(annotations, prefix):
    """Return the value of the last ``;``-separated attribute starting with *prefix* ("" if none)."""
    value = ""
    for attribute in annotations.split(";"):
        if attribute.startswith(prefix):
            value = attribute.split("=")[1]
    return value


def get_bgcs(bgc_file, prokka_gff, tool):
    """Assign biosynthetic gene cluster (BGC) annotations to the CDSs they contain.

    The BGC tool's GFF is read first to collect cluster intervals and
    their annotations; the gene GFF is then scanned and every feature
    lying entirely inside a cluster interval on the same contig receives
    that cluster's annotations.

    Args:
        bgc_file: Path to the 9-column GFF written by the BGC tool.  A
            falsy value means "no file".
        prokka_gff: Path to the gene-calling GFF.  Reading stops at a
            ``##FASTA`` line.
        tool: ``"sanntis"``, ``"gecco"`` or ``"antismash"``; selects which
            attributes are read and which keys are produced.  Any other
            value yields no annotations.

    Returns:
        Dict mapping a feature ID (value of the first attribute in column
        9 of the gene GFF) to a dict of annotations:
        ``nearest_MiBIG`` / ``nearest_MiBIG_class`` for sanntis,
        ``gecco_bgc_type`` for gecco, ``antismash_bgc_function`` and
        optionally ``antismash_product`` for antismash.  A cluster that
        lacks an attribute contributes an empty string for it.
    """
    cluster_positions = {}
    tool_result = {}
    bgc_annotations = {}
    if not bgc_file:
        return bgc_annotations

    # Pass 1: remember every cluster interval per contig (cluster_positions)
    # and the annotations attached to it (tool_result).
    with open(bgc_file, "r") as bgc_in:
        for line in bgc_in:
            if line.startswith("#") or not line.strip():
                continue
            (
                contig,
                _,
                feature,
                start_pos,
                end_pos,
                _,
                _,
                _,
                annotations,
            ) = line.strip().split("\t")

            # Values are looked up afresh for every row so that a cluster
            # missing an attribute never inherits the previous cluster's.
            as_product = ""
            if tool == "sanntis":
                cluster_info = {
                    "nearest_MiBIG_class": _last_attribute_value(annotations, "nearest_MiBIG_class="),
                    "nearest_MiBIG": _last_attribute_value(annotations, "nearest_MiBIG="),
                }
            elif tool == "gecco":
                cluster_info = {"bgc_type": _last_attribute_value(annotations, "Type=")}
            elif tool == "antismash":
                if feature != "gene":
                    continue
                cluster_info = {"bgc_function": _last_attribute_value(annotations, "as_type=")}
                as_product = _last_attribute_value(annotations, "as_gene_clusters=")
            else:
                cluster_info = None

            start, end = int(start_pos), int(end_pos)
            # key = contig name, value = list of [start, end] pairs
            cluster_positions.setdefault(contig, []).append([start, end])
            if cluster_info is None:
                continue
            # key = contig, value = dict keyed by 'start_end' of the cluster;
            # the first row seen for an interval wins.  The key is built from
            # the parsed integers so it always matches the lookup in pass 2.
            stored = tool_result.setdefault(contig, {}).setdefault(f"{start}_{end}", cluster_info)
            if as_product:
                stored["bgc_product"] = as_product

    # Pass 2: identify features that fall into one of the clusters.
    with open(prokka_gff, "r") as gff_in:
        for line in gff_in:
            if line.startswith("##FASTA"):
                break
            if line.startswith("#") or not line.strip():
                continue
            (
                contig,
                _,
                _,
                start_pos,
                end_pos,
                _,
                _,
                _,
                annotations,
            ) = line.strip().split("\t")

            matching_interval = ""
            for low, high in cluster_positions.get(contig, ()):
                # both ends of the feature must lie inside the cluster
                if low <= int(start_pos) <= high and low <= int(end_pos) <= high:
                    matching_interval = f"{low}_{high}"
                    break
            if not matching_interval:
                continue

            cds_id = annotations.split(";")[0].split("=")[1]
            if tool == "sanntis":
                info = tool_result[contig][matching_interval]
                bgc_annotations.setdefault(
                    cds_id,
                    {
                        "nearest_MiBIG": info["nearest_MiBIG"],
                        "nearest_MiBIG_class": info["nearest_MiBIG_class"],
                    },
                )
            elif tool == "gecco":
                info = tool_result[contig][matching_interval]
                bgc_annotations.setdefault(cds_id, {"gecco_bgc_type": info["bgc_type"]})
            elif tool == "antismash":
                info = tool_result[contig][matching_interval]
                bgc_annotations.setdefault(cds_id, {"antismash_bgc_function": info["bgc_function"]})
                if "bgc_product" in info:
                    bgc_annotations[cds_id]["antismash_product"] = info["bgc_product"]
    return bgc_annotations
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def get_amr(amr_file):
    """Turn an AMRFinderPlus TSV into ready-made GFF attribute strings.

    Args:
        amr_file: Path to the tab-separated file.  A falsy value means
            "no file".

    Returns:
        Dict mapping protein ID (column 0) to a ``;``-joined string of
        ``key=value`` attributes built from columns 5-11.  Rows whose
        protein ID is ``"NA"`` and the row starting with
        ``"Protein identifier"`` are left out.
    """
    annotations_by_protein = {}
    if not amr_file:
        return annotations_by_protein

    with open(amr_file, "r") as handle:
        for row in handle:
            if row.startswith("Protein identifier"):
                continue
            fields = row.strip().split("\t", 12)
            (
                protein_id,
                _,
                _,
                _,
                _,
                gene_name,
                seq_name,
                scope,
                element_type,
                element_subtype,
                drug_class,
                drug_subclass,
                _,
            ) = fields
            # annotations without a protein ID cannot be attached to a CDS
            # (they remain available only in the AMRFinderPlus TSV file)
            if protein_id == "NA":
                continue
            # ';' and '=' are GFF attribute delimiters, so take them out
            seq_name = seq_name.replace(";", ",").replace("=", " ")
            pairs = (
                ("amrfinderplus_gene_symbol", gene_name),
                ("amrfinderplus_sequence_name", seq_name),
                ("amrfinderplus_scope", scope),
                ("element_type", element_type),
                ("element_subtype", element_subtype),
                ("drug_class", drug_class),
                ("drug_subclass", drug_subclass),
            )
            annotations_by_protein[protein_id] = ";".join(f"{key}={value}" for key, value in pairs)
    return annotations_by_protein
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def get_dbcan(dbcan_file):
    """Build dbCAN attribute strings for proteins listed in a dbCAN GFF.

    Lines containing ``"predicted PUL"`` define a cluster and its two
    substrate predictions; every other non-comment line whose type
    (column 2) is not ``"null"`` is a protein that points to its cluster
    through the ``Parent`` attribute.

    Args:
        dbcan_file: Path to the GFF.  A falsy value means "no file".

    Returns:
        Dict mapping the protein's ``ID`` attribute to a string of the form
        ``dbcan_prot_type=..;dbcan_prot_family=..;substrate_dbcan-pul=..;substrate_dbcan-sub=..``.
    """
    annotations = {}
    if not dbcan_file:
        return annotations

    substrates_by_cluster = {}
    with open(dbcan_file, "r") as handle:
        for raw in handle:
            if "predicted PUL" in raw:
                # cluster line: remember its substrate predictions
                for attribute in raw.strip().split("\t")[8].split(";"):
                    if attribute.startswith("ID="):
                        cluster_id = attribute.split("=")[1]
                    elif attribute.startswith("substrate_dbcan-pul"):
                        pul_substrate = attribute.split("=")[1]
                    elif attribute.startswith("substrate_dbcan-sub"):
                        ecami_substrate = attribute.split("=")[1]
                cluster = substrates_by_cluster.setdefault(cluster_id, {})
                cluster["substrate_ecami"] = ecami_substrate
                cluster["substrate_pul"] = pul_substrate
                continue
            if raw.startswith("#"):
                continue

            cols = raw.strip().split("\t")
            prot_type = cols[2]
            attributes = cols[8].split(";")
            if prot_type == "null":
                continue
            for attribute in attributes:
                if attribute.startswith("ID"):
                    accession = attribute.split("=")[1]
                elif attribute.startswith("protein_family"):
                    family = attribute.split("=")[1]
                elif attribute.startswith("Parent"):
                    parent = attribute.split("=")[1]
            parent_cluster = substrates_by_cluster[parent]
            pul = parent_cluster["substrate_pul"]
            ecami = parent_cluster["substrate_ecami"]
            annotations[accession] = (
                f"dbcan_prot_type={prot_type};dbcan_prot_family={family};"
                f"substrate_dbcan-pul={pul};substrate_dbcan-sub={ecami}"
            )
    return annotations
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def get_defense_finder(df_file):
    """Build DefenseFinder attribute strings for genes in a DefenseFinder GFF.

    Lines containing ``"Anti-phage system"`` describe a system (ID, type,
    subtype); other lines containing ``"DefenseFinder"`` describe a gene
    that refers to its system through the ``Parent`` attribute.

    Args:
        df_file: Path to the GFF.  A falsy value means "no file".

    Returns:
        Dict mapping the gene's ``ID`` attribute to
        ``defense_finder_type=..;defense_finder_subtype=..``.
    """
    annotations = {}
    if not df_file:
        return annotations

    systems = {}
    with open(df_file, "r") as handle:
        for raw in handle:
            is_system = "Anti-phage system" in raw
            if not is_system and "DefenseFinder" not in raw:
                continue
            attributes = raw.strip().split("\t")[8].split(";")
            if is_system:
                for attribute in attributes:
                    if attribute.startswith("ID="):
                        feature_id = attribute.split("=")[1]
                    elif attribute.startswith("type"):
                        system_type = attribute.split("=")[1]
                    elif attribute.startswith("subtype"):
                        system_subtype = attribute.split("=")[1]
                system = systems.setdefault(feature_id, {})
                system["df_type"] = system_type
                system["df_subtype"] = system_subtype
            else:
                for attribute in attributes:
                    if attribute.startswith("ID="):
                        feature_id = attribute.split("=")[1]
                    elif attribute.startswith("Parent="):
                        parent = attribute.split("=")[1]
                parent_system = systems[parent]
                df_type = parent_system["df_type"]
                df_subtype = parent_system["df_subtype"]
                annotations[feature_id] = (
                    f"defense_finder_type={df_type};defense_finder_subtype={df_subtype}"
                )
    return annotations
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def load_annotations(
    in_gff,
    eggnog_file,
    ipr_file,
    sanntis_file,
    amr_file,
    antismash_file,
    gecco_file,
    dbcan_file,
    defense_finder_file,
    pseudofinder_file,
):
    """Read a GFF and decorate its CDS lines with annotations from other tools.

    Each optional tool output is parsed by its own ``get_*`` function;
    the GFF is then read once and every CDS line gets the matching
    annotations appended to column 9.  Non-CDS features are dropped,
    except ``region`` features whose source column is ``Bakta``.
    Proteins with an AntiFam hit are dropped as well.

    Args:
        in_gff: Path to the GFF to annotate (may contain a ``##FASTA``
            section).
        eggnog_file: eggNOG annotation table, or falsy.
        ipr_file: Pfam/InterPro TSV, or falsy.
        sanntis_file: SanntiS GFF, or falsy.
        amr_file: AMRFinderPlus TSV, or falsy.
        antismash_file: antiSMASH GFF, or falsy.
        gecco_file: GECCO GFF, or falsy.
        dbcan_file: dbCAN GFF, or falsy.
        defense_finder_file: DefenseFinder GFF, or falsy.
        pseudofinder_file: Pseudofinder GFF, or falsy.

    Returns:
        A tuple ``(header, main_gff, fasta, pseudogene_report_dict)``:
        ``header`` is the list of ``#`` lines other than ``##FASTA``;
        ``main_gff`` maps contig -> start coordinate (int) -> list of GFF
        lines; ``fasta`` is the ``##FASTA`` line followed by the lines
        after it; ``pseudogene_report_dict`` maps protein ID to a dict
        with boolean ``gene_caller`` and ``pseudofinder`` flags.
    """
    eggnogs = get_eggnog(eggnog_file)
    iprs, antifams = get_iprs(ipr_file)
    # membership is tested once per CDS, so use a set for the lookups
    antifams = set(antifams)
    sanntis_bgcs = get_bgcs(sanntis_file, in_gff, tool="sanntis")
    gecco_bgcs = get_bgcs(gecco_file, in_gff, tool="gecco")
    antismash_bgcs = get_bgcs(antismash_file, in_gff, tool="antismash")
    amr_annotations = get_amr(amr_file)
    dbcan_annotations = get_dbcan(dbcan_file)
    defense_finder_annotations = get_defense_finder(defense_finder_file)
    pseudogenes = get_pseudogenes(pseudofinder_file)

    eggnog_keys = ("eggNOG", "cog", "kegg", "Ontology_term")
    ipr_keys = ("pfam", "interpro")
    bgc_sources = (sanntis_bgcs, gecco_bgcs, antismash_bgcs)
    # these sources already hold complete 'key=value;...' strings
    preformatted_sources = (
        ("AMR", amr_annotations),
        ("dbCAN", dbcan_annotations),
        ("defense_finder", defense_finder_annotations),
    )
    preformatted_keys = ("AMR", "dbCAN", "defense_finder")

    pseudogene_report_dict = {}
    main_gff = {}
    header = []
    fasta = []
    fasta_flag = False
    with open(in_gff) as f:
        for line in f:
            line = line.strip()
            if not line:
                # a blank line carries no feature, header or sequence data
                continue
            if line.startswith("#"):
                if line == "##FASTA":
                    fasta_flag = True
                    fasta.append(line)
                else:
                    header.append(line)
                continue
            if fasta_flag:
                fasta.append(line)
                continue

            # feature line: normalise attribute names and curly quotes
            line = line.replace("db_xref", "Dbxref")
            line = line.replace(";note=", ";Note=")
            line = line.replace("‘", "'").replace("’", "'")
            cols = line.split("\t")
            if len(cols) != 9:
                continue
            contig, caller, feature, start, annot = cols[0], cols[1], cols[2], cols[3], cols[8]
            if feature != "CDS":
                if caller == "Bakta" and feature == "region":
                    main_gff.setdefault(contig, {}).setdefault(int(start), []).append(line)
                continue

            protein = annot.split(";")[0].split("=")[-1]
            if protein in antifams:
                # Don't print to the final GFF proteins that are known to not be real.
                # NOTE(review): because of this skip, AntiFam proteins never reach
                # pseudogene_report_dict - confirm that is the intended report content.
                continue

            # insertion order of this dict is the order attributes are appended in
            added_annot = {}

            # process pseudogenes
            if "pseudo=true" in annot.lower():
                # fix case
                cols[8] = annot.replace("pseudo=True", "pseudo=true")
                # gene is already marked as a pseudogene; log it but don't add to the annotation again
                report = pseudogene_report_dict.setdefault(protein, {})
                report["gene_caller"] = True
                report["pseudofinder"] = protein in pseudogenes
            elif protein in pseudogenes:
                # gene caller did not detect this protein as a pseudogene but pseudofinder did
                report = pseudogene_report_dict.setdefault(protein, {})
                report["gene_caller"] = False
                report["pseudofinder"] = True
                added_annot["pseudo"] = "true"
                if pseudogenes[protein]:
                    cols[8] = add_pseudogene_to_note(pseudogenes[protein], cols[8])

            if protein in eggnogs:
                for key, values in zip(eggnog_keys, eggnogs[protein]):
                    if values != [""] and values != ["NA"]:
                        added_annot[key] = values
            if protein in iprs:
                for key, accessions in zip(ipr_keys, iprs[protein]):
                    accessions = list(accessions)
                    if accessions and accessions != [""]:
                        added_annot[key] = sorted(accessions)
            for bgcs in bgc_sources:
                added_annot.update(bgcs.get(protein, {}))
            for key, source in preformatted_sources:
                if protein in source:
                    added_annot[key] = source[protein]

            for key, value in added_annot.items():
                if isinstance(value, list):
                    value = ",".join(value)
                if key in preformatted_keys:
                    cols[8] = f"{cols[8]};{value}"
                elif value != "-":
                    # "-" marks "no value" and is not written out
                    cols[8] = f"{cols[8]};{key}={value}"
            line = "\t".join(cols)
            main_gff.setdefault(contig, {}).setdefault(int(start), []).append(line)
    return header, main_gff, fasta, pseudogene_report_dict
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
def get_ncrnas(ncrnas_file):
    """Convert a whitespace-delimited ncRNA hit table into GFF lines.

    The table is presumably Infernal ``cmscan`` tabular output (the rows
    are written with source ``INFERNAL:1.1.4``) - confirm with the
    producer.  Columns used: 1 = model name, 2 = model accession,
    3 = contig, 9/10 = coordinates, 11 = strand, 28 onwards = description.
    Hits to ``RF00005`` are skipped.

    Args:
        ncrnas_file: Path to the hit table.

    Returns:
        Dict mapping contig -> start coordinate (int) -> list of GFF lines.
    """
    lines_by_contig = {}
    hit_number = 0
    with open(ncrnas_file, "r") as handle:
        for raw in handle:
            if raw.startswith("#"):
                continue
            cols = raw.strip().split()
            # every data row is counted, including the tRNA rows skipped below
            hit_number += 1
            contig = cols[3]
            model = cols[2]
            if model == "RF00005":
                # Skip tRNAs, we add them from tRNAscan-SE
                continue

            locus = f"{contig}_ncRNA{hit_number}"
            product = " ".join(cols[28:])
            strand = cols[11]
            if strand == "+":
                start, end = int(cols[9]), int(cols[10])
            else:
                # coordinates are swapped for the other strand
                start, end = int(cols[10]), int(cols[9])
            rna_feature_name, ncrna_class = prepare_rna_gff_fields(cols)

            attributes = [
                f"ID={locus}",
                "inference=Rfam:14.9",
                f"locus_tag={locus}",
                f"product={product}",
                f"rfam={model}",
            ]
            if ncrna_class:
                attributes.append(f"ncRNA_class={ncrna_class}")
            gff_fields = [
                contig,
                "INFERNAL:1.1.4",
                rna_feature_name,
                str(start),
                str(end),
                ".",
                strand,
                ".",
                ";".join(attributes),
            ]
            lines_by_contig.setdefault(contig, {}).setdefault(start, []).append("\t".join(gff_fields))
    return lines_by_contig
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
# ncRNA class -> Rfam accessions belonging to it.  Built once at import
# time instead of on every call.
_NCRNA_CLASS_RFAMS = {
    "antisense_RNA": [
        "RF00039", "RF00042", "RF00057", "RF00106", "RF00107", "RF00236",
        "RF00238", "RF00240", "RF00242", "RF00262", "RF00388", "RF00489",
        "RF01695", "RF01794", "RF01797", "RF01809", "RF01813", "RF02194",
        "RF02235", "RF02236", "RF02237", "RF02238", "RF02239", "RF02519",
        "RF02550", "RF02558", "RF02559", "RF02560", "RF02563", "RF02592",
        "RF02662", "RF02674", "RF02735", "RF02743", "RF02792", "RF02793",
        "RF02812", "RF02818", "RF02819", "RF02820", "RF02839", "RF02843",
        "RF02844", "RF02846", "RF02850", "RF02851", "RF02855", "RF02873",
        "RF02874", "RF02875", "RF02876", "RF02891", "RF02892", "RF02903",
        "RF02908",
    ],
    "autocatalytically_spliced_intron": ["RF01807"],
    "ribozyme": [
        "RF00621", "RF01787", "RF01788", "RF01865", "RF02678", "RF02679",
        "RF02681", "RF02682", "RF02684", "RF03154", "RF03160", "RF04188",
    ],
    "hammerhead_ribozyme": [
        "RF00008", "RF00163", "RF02275", "RF02276", "RF02277", "RF03152",
    ],
    "RNase_P_RNA": [
        "RF00009", "RF00010", "RF00011", "RF00373", "RF01577", "RF02357",
    ],
    "RNase_MRP_RNA": ["RF00030", "RF02472"],
    "telomerase_RNA": ["RF00024", "RF00025", "RF01050", "RF02462"],
    "scaRNA": [
        "RF00231", "RF00283", "RF00286", "RF00422", "RF00423", "RF00424",
        "RF00426", "RF00427", "RF00478", "RF00492", "RF00553", "RF00564",
        "RF00565", "RF00582", "RF00601", "RF00602", "RF01268", "RF01295",
        "RF02665", "RF02666", "RF02667", "RF02668", "RF02669", "RF02670",
        "RF02718", "RF02719", "RF02720", "RF02721", "RF02722",
    ],
    "snRNA": ["RF01802"],
    "SRP_RNA": [
        "RF00017", "RF00169", "RF01502", "RF01570", "RF01854", "RF01855",
        "RF01856", "RF01857", "RF04183",
    ],
    "vault_RNA": ["RF00006"],
    "Y_RNA": ["RF00019", "RF02553", "RF01053", "RF02565"],
}


def prepare_rna_gff_fields(cols):
    """Choose the GFF feature type and ncRNA class for one hit-table row.

    Args:
        cols: The row split into columns; column 1 is the model name,
            column 2 the Rfam accession, and columns 28 onwards (or the
            last column, for shorter rows) the description.

    Returns:
        A tuple ``(rna_feature_name, ncrna_class)``.  The feature name is
        ``"rRNA"`` for the LSU/SSU/5S rRNA models (class ``""``) and
        ``"ncRNA"`` otherwise.  For ncRNAs the class comes from the
        accession table, else ``"pre_miRNA"`` when the description
        mentions ``microRNA``, else ``"other"``.
    """
    if cols[1] in ("LSU_rRNA_bacteria", "SSU_rRNA_bacteria", "5S_rRNA"):
        return "rRNA", ""

    accession = cols[2]
    ncrna_class = next(
        (rna_type for rna_type, rfams in _NCRNA_CLASS_RFAMS.items() if accession in rfams),
        None,
    )
    if not ncrna_class:
        # get_ncrnas splits rows on whitespace, which spreads the description
        # over several columns; look at all of them rather than only the last word.
        description = " ".join(cols[28:]) or cols[-1]
        ncrna_class = "pre_miRNA" if "microRNA" in description else "other"
    return "ncRNA", ncrna_class
|
|
728
|
+
|
|
729
|
+
|
|
730
|
+
def get_trnas(trnas_file):
    """Collect the tRNA features of a GFF file, grouped by contig and start.

    Only lines whose feature type (column 2) is ``tRNA`` are kept; in
    those lines every occurrence of ``tRNAscan-SE`` is rewritten to
    ``tRNAscan-SE:2.0.9``.

    Args:
        trnas_file: Path to the tab-separated GFF file.

    Returns:
        Dict mapping contig -> start coordinate (int) -> list of GFF lines
        (stripped of surrounding whitespace).
    """
    trnas_by_contig = {}
    with open(trnas_file, "r") as handle:
        for raw in handle:
            if raw.startswith("#"):
                continue
            fields = raw.split("\t")
            contig, feature, start = fields[0], fields[2], fields[3]
            if feature != "tRNA":
                continue
            versioned = raw.replace("tRNAscan-SE", "tRNAscan-SE:2.0.9").strip()
            by_start = trnas_by_contig.setdefault(contig, {})
            by_start.setdefault(int(start), []).append(versioned)
    return trnas_by_contig
|
|
743
|
+
|
|
744
|
+
|
|
745
|
+
def load_crispr(crispr_file):
    """Group the lines of a CRISPR GFF into records of adjacent features.

    Consecutive lines belong to the same record while they are on the
    same contig and each feature starts within one base of the previous
    feature's end.

    Args:
        crispr_file: Path to the tab-separated GFF file.

    Returns:
        Dict mapping contig -> start coordinate of the record's first
        feature (int) -> list of records, each record being the list of
        its raw lines (line endings included).
    """
    crispr_annotations = {}
    record = []
    record_contig = ""
    left_coord = ""
    previous_end = ""

    with open(crispr_file, "r") as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            cols = line.strip().split("\t")
            contig, start, end = cols[0], int(cols[3]), int(cols[4])

            if record and contig == record_contig and abs(start - previous_end) < 2:
                # the line is a continuation of an existing record
                record.append(line)
                previous_end = end
                continue

            if record:
                # The previous record is complete.  File it under the contig
                # it was read from, not under the contig of the current line.
                crispr_annotations.setdefault(record_contig, {}).setdefault(
                    left_coord, []
                ).append(record)

            # start a new record, remembering its contig for later comparisons
            record = [line]
            record_contig = contig
            left_coord = start
            previous_end = end

    if record:
        crispr_annotations.setdefault(record_contig, {}).setdefault(left_coord, []).append(record)
    return crispr_annotations
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
def get_pseudogenes(pseudofinder_file):
    """Map original locus tags to the note Pseudofinder attached to them.

    For every feature line, column 9 is split into ``key=value``
    attributes (a backslash-escaped ``;`` or ``=`` is not treated as a
    separator).  Each tag listed in ``old_locus_tag`` that does not
    contain ``_ign_`` is recorded with the line's ``note`` value.

    Args:
        pseudofinder_file: Path to the Pseudofinder GFF.  A falsy value
            means "no file".

    Returns:
        Dict mapping locus tag to the note text ("" when the line has no
        ``note`` attribute).

    Raises:
        ValueError: If an attribute has no ``=`` at all.
    """
    pseudogenes = {}
    if not pseudofinder_file:
        return pseudogenes
    with open(pseudofinder_file) as file_in:
        for line in file_in:
            if line.startswith("#") or not line.strip():
                continue
            col9 = line.strip().split("\t")[8]
            attributes_dict = dict(
                # maxsplit=1 keeps values that themselves contain '=' intact;
                # empty items (e.g. after a trailing ';') are ignored
                re.split(r"(?<!\\)=", item, maxsplit=1)
                for item in re.split(r"(?<!\\);", col9)
                if item
            )
            if "old_locus_tag" not in attributes_dict:
                continue
            note = attributes_dict.get("note", "")
            for tag in attributes_dict["old_locus_tag"].split(","):
                if "_ign_" not in tag:
                    pseudogenes[tag] = note
    return pseudogenes
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
def add_pseudogene_to_note(note_text, col9):
    """Add *note_text* to the ``Note`` attribute of a GFF column-9 string.

    If a ``Note`` attribute exists, the text is appended to it after
    ``", "``.  Otherwise a new ``Note`` attribute is inserted right after
    ``locus_tag``, or at the end when there is no ``locus_tag``.

    Args:
        note_text: Text to add.
        col9: The ``;``-separated ``key=value`` attribute string
            (backslash-escaped ``;`` and ``=`` are not separators).

    Returns:
        The rebuilt attribute string.

    Raises:
        ValueError: If an attribute has no ``=`` at all.
    """
    col9_dict = dict(
        # maxsplit=1 keeps values that themselves contain '=' intact;
        # empty items (e.g. after a trailing ';') are ignored
        re.split(r"(?<!\\)=", item, maxsplit=1)
        for item in re.split(r"(?<!\\);", col9)
        if item
    )
    if "Note" in col9_dict:
        col9_dict["Note"] = f"{col9_dict['Note']}, {note_text}"
    else:
        rebuilt = {}
        for key, value in col9_dict.items():
            rebuilt[key] = value
            if key == "locus_tag":
                # insert note after locus tag
                rebuilt["Note"] = note_text
        # no locus_tag present: fall back to appending the note at the end
        rebuilt.setdefault("Note", note_text)
        col9_dict = rebuilt
    return ";".join(f"{key}={value}" for key, value in col9_dict.items())
|