mgnify-pipelines-toolkit 1.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +84 -21
- mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +11 -0
- mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +25 -7
- mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +211 -0
- mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +162 -0
- mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +230 -0
- mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +119 -0
- mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +6 -3
- mgnify_pipelines_toolkit/constants/thresholds.py +1 -1
- {mgnify_pipelines_toolkit-1.0.4.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/METADATA +1 -1
- {mgnify_pipelines_toolkit-1.0.4.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/RECORD +15 -11
- {mgnify_pipelines_toolkit-1.0.4.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/entry_points.txt +5 -0
- {mgnify_pipelines_toolkit-1.0.4.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/WHEEL +0 -0
- {mgnify_pipelines_toolkit-1.0.4.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/licenses/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.0.4.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py
CHANGED
@@ -17,8 +17,19 @@
 
 import re
 import sys
+import fileinput
 
-from mgnify_pipelines_toolkit.constants.thresholds import
+from mgnify_pipelines_toolkit.constants.thresholds import (
+    EVALUE_CUTOFF_IPS,
+    EVALUE_CUTOFF_EGGNOG,
+)
+
+DBCAN_CLASSES_DICT = {
+    "TC": "dbcan_transporter_classification",
+    "TF": "dbcan_transcription_factor",
+    "STP": "dbcan_signal_transduction_prot",
+    "CAZyme": "dbcan_prot_family",
+}
 
 
 def get_iprs(ipr_annot):
@@ -26,7 +37,8 @@ def get_iprs(ipr_annot):
     antifams = list()
     if not ipr_annot:
         return iprs, antifams
-    with
+    with fileinput.hook_compressed(ipr_annot, "r", encoding="utf-8") as f:
+
         for line in f:
             cols = line.strip().split("\t")
             protein = cols[0]
@@ -55,7 +67,8 @@ def get_eggnog(eggnog_annot):
     eggnogs = {}
     if not eggnog_annot:
         return eggnogs
-    with
+    with fileinput.hook_compressed(eggnog_annot, "r", encoding="utf-8") as f:
+
         for line in f:
             line = line.rstrip()
             cols = line.split("\t")
@@ -104,7 +117,8 @@ def get_bgcs(bgc_file, prokka_gff, tool):
         return bgc_annotations
     # save positions of each BGC cluster to dictionary cluster_positions
    # and save the annotations to dictionary bgc_result
-    with
+    with fileinput.hook_compressed(bgc_file, "r", encoding="utf-8") as bgc_in:
+
         for line in bgc_in:
             if not line.startswith("#"):
                 (
@@ -138,7 +152,7 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                 type_value = ""
                 as_product = ""
                 for a in annotations.split(
-
+                    ";"
                 ):  # go through all parts of the annotation field
                     if a.startswith("as_type="):
                         type_value = a.split("=")[1]
@@ -170,9 +184,12 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                         {"bgc_function": type_value},
                     )
                 if as_product:
-                    tool_result[contig]["_".join([start_pos, end_pos])][
+                    tool_result[contig]["_".join([start_pos, end_pos])][
+                        "bgc_product"
+                    ] = as_product
     # identify CDSs that fall into each of the clusters annotated by the BGC tool
-    with
+    with fileinput.hook_compressed(prokka_gff, "r", encoding="utf-8") as gff_in:
+
         for line in gff_in:
             if not line.startswith("#"):
                 matching_interval = ""
@@ -228,8 +245,9 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                         },
                     )
                     if "bgc_product" in tool_result[contig][matching_interval]:
-                        bgc_annotations[cds_id]["antismash_product"] = tool_result[
-
+                        bgc_annotations[cds_id]["antismash_product"] = tool_result[
+                            contig
+                        ][matching_interval]["bgc_product"]
             elif line.startswith("##FASTA"):
                 break
     return bgc_annotations
@@ -239,7 +257,7 @@ def get_amr(amr_file):
     amr_annotations = {}
     if not amr_file:
         return amr_annotations
-    with
+    with fileinput.hook_compressed(amr_file, "r", encoding="utf-8") as f:
         for line in f:
             if line.startswith("Protein identifier"):
                 continue
@@ -286,7 +304,7 @@ def get_dbcan(dbcan_file):
     substrates = dict()
     if not dbcan_file:
        return dbcan_annotations
-    with
+    with fileinput.hook_compressed(dbcan_file, "r", encoding="utf-8") as f:
        for line in f:
            if "predicted PUL" in line:
                annot_fields = line.strip().split("\t")[8].split(";")
@@ -314,13 +332,45 @@ def get_dbcan(dbcan_file):
                 elif a.startswith("Parent"):
                     parent = a.split("=")[1]
             dbcan_annotations[acc] = (
-                "dbcan_prot_type={};
+                "dbcan_prot_type={};{}={};substrate_dbcan-pul={};substrate_dbcan-sub={}".format(
                     prot_type,
+                    DBCAN_CLASSES_DICT[prot_type],
                     prot_fam,
                     substrates[parent]["substrate_pul"],
                     substrates[parent]["substrate_ecami"],
                 )
             )
+
+    return dbcan_annotations
+
+
+def get_dbcan_individual_cazys(dbcan_cazys_file):
+    dbcan_annotations = dict()
+    if not dbcan_cazys_file:
+        return dbcan_annotations
+    with fileinput.hook_compressed(dbcan_cazys_file, "r", encoding="utf-8") as f:
+        for line in f:
+            if line.startswith("#"):
+                continue
+            attributes = line.strip().split("\t")[8]
+            attributes_dict = dict(
+                re.split(r"(?<!\\)=", item)
+                for item in re.split(r"(?<!\\);", attributes.rstrip(";"))
+            )
+            if "num_tools" in attributes_dict and int(attributes_dict["num_tools"]) < 2:
+                continue  # don't keep annotations supported by only one tool within dbcan
+            cds_pattern = r"\.CDS\d+$"
+            protein = re.sub(
+                cds_pattern, "", attributes_dict["ID"]
+            )  # remove the CDS number
+            annotation_text = "dbcan_prot_type=CAZyme;"
+            for field in ["protein_family", "substrate_dbcan-sub", "eC_number"]:
+                if field in attributes_dict:
+                    annotation_text += (
+                        f"{'dbcan_prot_family' if field == 'protein_family' else field}"
+                        f"={attributes_dict[field]};"
+                    )
+            dbcan_annotations[protein] = annotation_text.strip(";")
     return dbcan_annotations
 
 
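The new get_dbcan_individual_cazys above parses GFF column 9 into a dict with re.split and a negative lookbehind, so escaped separators inside attribute values survive the split. A minimal standalone sketch of that parsing step (the attribute string below is made up for illustration):

    import re

    # Hypothetical column-9 string; the "\;" inside the note value is an
    # escaped separator and must not split the field.
    attributes = r"ID=prot_1.CDS1;protein_family=GH13;note=alpha\;beta;num_tools=3"

    # Split on ";" and "=" only when they are not preceded by a backslash.
    attributes_dict = dict(
        re.split(r"(?<!\\)=", item)
        for item in re.split(r"(?<!\\);", attributes.rstrip(";"))
    )
    print(attributes_dict["protein_family"])  # GH13
    print(attributes_dict["note"])            # alpha\;beta
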
@@ -329,7 +379,8 @@ def get_defense_finder(df_file):
     type_info = dict()
     if not df_file:
         return defense_finder_annotations
-    with
+    with fileinput.hook_compressed(df_file, "r", encoding="utf-8") as f:
+
         for line in f:
             if "Anti-phage system" in line:
                 annot_fields = line.strip().split("\t")[8].split(";")
@@ -366,6 +417,7 @@ def load_annotations(
     antismash_file,
     gecco_file,
     dbcan_file,
+    dbcan_cazys_file,
     defense_finder_file,
     pseudofinder_file,
 ):
@@ -376,6 +428,7 @@ def load_annotations(
     antismash_bgcs = get_bgcs(antismash_file, in_gff, tool="antismash")
     amr_annotations = get_amr(amr_file)
     dbcan_annotations = get_dbcan(dbcan_file)
+    dbcan_cazys_annotations = get_dbcan_individual_cazys(dbcan_cazys_file)
     defense_finder_annotations = get_defense_finder(defense_finder_file)
     pseudogenes = get_pseudogenes(pseudofinder_file)
     pseudogene_report_dict = dict()
@@ -384,7 +437,7 @@ def load_annotations(
     header = []
     fasta = []
     fasta_flag = False
-    with
+    with fileinput.hook_compressed(in_gff, "r", encoding="utf-8") as f:
         for line in f:
             line = line.strip()
             if line[0] != "#" and not fasta_flag:
@@ -496,6 +549,11 @@ def load_annotations(
                     added_annot[protein]["dbCAN"] = dbcan_annotations[protein]
                 except KeyError:
                     pass
+                try:
+                    dbcan_cazys_annotations[protein]
+                    added_annot[protein]["dbCAN"] = dbcan_cazys_annotations[protein]
+                except KeyError:
+                    pass
                 try:
                     defense_finder_annotations[protein]
                     added_annot[protein]["defense_finder"] = (
@@ -530,7 +588,7 @@ def load_annotations(
 def get_ncrnas(ncrnas_file):
     ncrnas = {}
     counts = 0
-    with
+    with fileinput.hook_compressed(ncrnas_file, "r", encoding="utf-8") as f:
         for line in f:
             if not line.startswith("#"):
                 cols = line.strip().split()
@@ -543,7 +601,9 @@ def get_ncrnas(ncrnas_file):
                 # Skip tRNAs, we add them from tRNAscan-SE
                 continue
             strand = cols[11]
-            start, end =
+            start, end = int(cols[10]), int(cols[9])
+            if strand == "+":
+                start, end = end, start
             rna_feature_name, ncrna_class = prepare_rna_gff_fields(cols)
             annot = [
                 "ID=" + locus,
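The start/end handling added to get_ncrnas deals with Infernal-style hit tables, where coordinates run "seq from" to "seq to" and are reversed on the minus strand. A sketch of the same normalisation in isolation, assuming cols[9]/cols[10] hold "seq from"/"seq to" and cols[11] the strand, which is how the surrounding function indexes the row:

    # cmscan/cmsearch --tblout hits run "seq from" -> "seq to", so from > to
    # on the minus strand. Normalise to start <= end regardless of strand:
    def normalise(cols):
        strand = cols[11]
        start, end = int(cols[10]), int(cols[9])
        if strand == "+":
            start, end = end, start
        return start, end

    print(normalise(["?"] * 9 + ["100", "250", "+"]))  # (100, 250)
    print(normalise(["?"] * 9 + ["250", "100", "-"]))  # (100, 250)
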
@@ -718,7 +778,10 @@ def prepare_rna_gff_fields(cols):
     }
 
     if rna_feature_name == "ncRNA":
-        ncrna_class = next(
+        ncrna_class = next(
+            (rna_type for rna_type, rfams in rna_types.items() if cols[2] in rfams),
+            None,
+        )
         if not ncrna_class:
             if "microRNA" in cols[-1]:
                 ncrna_class = "pre_miRNA"
@@ -729,7 +792,7 @@ def prepare_rna_gff_fields(cols):
 
 def get_trnas(trnas_file):
     trnas = {}
-    with
+    with fileinput.hook_compressed(trnas_file, "r", encoding="utf-8") as f:
         for line in f:
             if not line.startswith("#"):
                 cols = line.split("\t")
@@ -738,13 +801,13 @@ def get_trnas(trnas_file):
                 line = line.replace("tRNAscan-SE", "tRNAscan-SE:2.0.9")
                 trnas.setdefault(contig, dict()).setdefault(
                     int(start), list()
-                ).append(line.strip())
+                ).append(line.strip().strip(";"))
     return trnas
 
 
 def load_crispr(crispr_file):
     crispr_annotations = dict()
-    with
+    with fileinput.hook_compressed(crispr_file, "r", encoding="utf-8") as f:
         record = list()
         left_coord = ""
         loc_contig = ""
@@ -791,7 +854,7 @@ def get_pseudogenes(pseudofinder_file):
     pseudogenes = dict()
     if not pseudofinder_file:
         return pseudogenes
-    with
+    with fileinput.hook_compressed(pseudofinder_file, "r", encoding="utf-8") as file_in:
         for line in file_in:
             if not line.startswith("#"):
                 col9 = line.strip().split("\t")[8]
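The change repeated throughout gff_annotation_utils.py is replacing plain open() calls with fileinput.hook_compressed, which dispatches to gzip.open or bz2.open based on the file suffix and falls back to open() otherwise, so every parser now accepts both compressed and uncompressed inputs. A standalone sketch of the pattern (not the toolkit's code; the encoding keyword on hook_compressed requires Python 3.10+):

    import fileinput
    import gzip
    import tempfile
    from pathlib import Path

    # Write one gzipped and one plain TSV so both code paths are exercised.
    tmp = Path(tempfile.mkdtemp())
    with gzip.open(tmp / "annotations.tsv.gz", "wt", encoding="utf-8") as fh:
        fh.write("prot_1\tIPR000001\n")
    (tmp / "annotations.tsv").write_text("prot_2\tIPR000002\n", encoding="utf-8")

    for name in ("annotations.tsv.gz", "annotations.tsv"):
        # hook_compressed picks gzip.open/bz2.open from the ".gz"/".bz2"
        # suffix and returns an ordinary file object, so it works directly
        # as a context manager.
        with fileinput.hook_compressed(str(tmp / name), "r", encoding="utf-8") as f:
            for line in f:
                print(name, "->", line.strip().split("\t"))
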
mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py
CHANGED
@@ -28,6 +28,17 @@ def write_results_to_file(
     contig_list = check_for_additional_keys(
         ncrnas, trnas, crispr_annotations, contig_list
     )
+    # sort contigs by digit at the end of contig/genome accession
+    if contig_list[0].startswith(
+        "MGYG"
+    ):  # e.g. 'MGYG000500002_1', 'MGYG000500002_2', 'MGYG000500002_3'
+        contig_list = sorted(list(contig_list), key=lambda x: int(x.split("_")[-1]))
+    elif contig_list[0].startswith(
+        "ERZ"
+    ):  # e.g. 'ERZ1049444', 'ERZ1049445', 'ERZ1049446'
+        contig_list = sorted(
+            list(contig_list), key=lambda x: int(x.split("ERZ")[-1])
+        )
     for contig in contig_list:
         sorted_pos_list = sort_positions(
             contig, main_gff_extended, ncrnas, trnas, crispr_annotations
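The block added to write_results_to_file makes the contig output order deterministic by sorting on the numeric tail of MGYG or ERZ accessions rather than lexicographically. A toy illustration with made-up accessions:

    contigs = ["MGYG000500002_10", "MGYG000500002_2", "MGYG000500002_1"]
    erz = ["ERZ1049446", "ERZ1049444", "ERZ1049445"]

    # Plain sorted() is lexicographic and would order "_10" before "_2";
    # the numeric key used in the diff avoids that.
    print(sorted(contigs, key=lambda x: int(x.split("_")[-1])))
    # ['MGYG000500002_1', 'MGYG000500002_2', 'MGYG000500002_10']
    print(sorted(erz, key=lambda x: int(x.split("ERZ")[-1])))
    # ['ERZ1049444', 'ERZ1049445', 'ERZ1049446']
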
mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py
CHANGED
@@ -17,8 +17,16 @@
 
 import argparse
 
-from gff_annotation_utils import
-
+from mgnify_pipelines_toolkit.analysis.assembly.gff_annotation_utils import (
+    get_ncrnas,
+    get_trnas,
+    load_annotations,
+    load_crispr,
+)
+from mgnify_pipelines_toolkit.analysis.assembly.gff_file_utils import (
+    write_results_to_file,
+    print_pseudogene_report,
+)
 
 
 def main(
@@ -31,6 +39,7 @@ def main(
     antismash_file,
     gecco_file,
     dbcan_file,
+    dbcan_cazys_file,
     defense_finder_file,
     pseudofinder_file,
     rfam_file,
@@ -53,6 +62,7 @@ def main(
         antismash_file,
         gecco_file,
         dbcan_file,
+        dbcan_cazys_file,
         defense_finder_file,
         pseudofinder_file,
     )
@@ -66,7 +76,9 @@ def main(
     if crispr_file:
         crispr_annotations = load_crispr(crispr_file)
 
-    write_results_to_file(
+    write_results_to_file(
+        outfile, header, main_gff_extended, fasta, ncrnas, trnas, crispr_annotations
+    )
     if pseudogene_report_file:
         print_pseudogene_report(pseudogene_report_dict, pseudogene_report_file)
 
@@ -74,7 +86,7 @@ def main(
 def parse_args():
     parser = argparse.ArgumentParser(
         description="The script extends a user-provided base GFF annotation file by incorporating "
-
+        "information extracted from the user-provided outputs of supplementary annotation tools.",
     )
     parser.add_argument(
         "-g",
@@ -124,7 +136,12 @@ def parse_args():
     )
     parser.add_argument(
         "--dbcan",
-        help="The GFF file produced by dbCAN post-processing script",
+        help="The GFF file produced by dbCAN post-processing script that uses cluster annotations",
+        required=False,
+    )
+    parser.add_argument(
+        "--dbcan-cazys",
+        help="The GFF file produced by dbCAN-CAZYs post-processing script",
         required=False,
     )
     parser.add_argument(
@@ -149,7 +166,7 @@ def parse_args():
     return parser.parse_args()
 
 
-if __name__ ==
+if __name__ == "__main__":
     args = parse_args()
     main(
         args.gff_input,
@@ -161,10 +178,11 @@ if __name__ == '__main__':
         args.antismash,
         args.gecco,
         args.dbcan,
+        args.dbcan_cazys,
         args.defense_finder,
         args.pseudofinder,
         args.rfam,
         args.trnascan,
         args.outfile,
         args.pseudogene_report,
-
+    )
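A side note on the new --dbcan-cazys option: argparse derives the attribute name by replacing dashes in the long option with underscores, which is why the diff can pass args.dbcan_cazys straight into main(). A minimal demonstration (the option value is a placeholder):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dbcan-cazys", required=False)
    # argparse turns dashes in long option names into underscores for the
    # attribute name, hence args.dbcan_cazys in the diff above.
    args = parser.parse_args(["--dbcan-cazys", "dbcan_cazys.gff"])
    print(args.dbcan_cazys)  # dbcan_cazys.gff
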
mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py
ADDED
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2023-2025 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import fileinput
+import logging
+from pathlib import Path
+import re
+
+logging.basicConfig(level=logging.INFO)
+
+
+def main(hmm_file, overview_file, genome_gff, outfile, dbcan_version):
+
+    hmm_path = Path(hmm_file)
+    overview_path = Path(overview_file)
+
+    if not hmm_path.is_file():
+        raise FileNotFoundError(f"Input hmm path does not exist: {hmm_file}")
+
+    if not overview_path.is_file():
+        raise FileNotFoundError(f"Input overview path does not exist: {overview_file}")
+
+    substrates = load_substrates(hmm_path)
+    genome_gff_lines = load_gff(genome_gff)
+
+    print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_lines)
+
+
+def load_gff(gff):
+    genome_gff_lines = dict()
+    with fileinput.hook_compressed(gff, "rt") as gff:
+        for line in gff:
+            if line.startswith("##FASTA"):
+                return genome_gff_lines
+
+            fields = line.strip().split("\t")
+            if len(fields) != 9 or fields[2] != "CDS":
+                continue
+
+            if "Parent=" in line:
+                # Get transcript name from the 9th column for mettannotator
+                match = re.search(r"Parent=([^;]+)", fields[8])
+            elif "ID=" in line:
+                # Get transcript name from the 9th column for ASA
+                match = re.search(r"ID=([^;]+)", fields[8])
+            else:
+                logging.error(
+                    "Not sure what gff annotation delimiter is in use. Exiting"
+                )
+                exit(1)
+
+            transcript_name = match.group(1)
+            genome_gff_lines.setdefault(transcript_name, []).append(line)
+    return genome_gff_lines
+
+
+def print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_lines):
+    with open(outfile, "w") as file_out:
+        file_out.write("##gff-version 3\n")
+        with fileinput.hook_compressed(overview_file, "rt") as file_in:
+            for line in file_in:
+                if line.startswith("MGYG") or line.startswith("ERZ"):
+                    (
+                        transcript,
+                        ec_number_raw,
+                        dbcan_hmmer,
+                        dbcan_sub_ecami,
+                        diamond,
+                        num_of_tools,
+                    ) = line.strip().split("\t")
+                    # EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
+
+                    ec_number = ""
+                    ec_list = ec_number_raw.split("|")
+                    for ec in ec_list:
+                        if ec != "-":
+                            ec_number += ec.split(":")[0] + "|"
+
+                    ec_number = ec_number.strip("|")
+
+                    # Dbcan recommends to use subfamily preference as dbcan_hmmer > dbcan_sub_ecami > diamond
+                    # diamond is messier, so we don't report it here
+                    if dbcan_hmmer != "-":
+                        # the field dbcan_hmmer reports match positions in parentheses, clear them out first:
+                        subfamily = dbcan_hmmer.split("(")[0]
+                    elif dbcan_sub_ecami != "-":
+                        subfamily = dbcan_sub_ecami
+                    else:
+                        continue
+                    cleaned_substrates = ",".join(
+                        sorted(
+                            {
+                                subsrate.strip()
+                                for subsrate in substrates.get(transcript, "N/A").split(
+                                    ","
+                                )
+                            }
+                        )
+                    )
+                    # Assemble information to add to the 9th column
+                    col9_parts = [
+                        f"protein_family={subfamily}",
+                        f"substrate_dbcan-sub={cleaned_substrates}",
+                    ]
+
+                    if ec_number:
+                        col9_parts.append(f"eC_number={ec_number}")
+
+                    col9_parts.append(f"num_tools={num_of_tools}")
+                    col9_text = ";".join(col9_parts)
+
+                    for gff_line in genome_gff_lines[transcript]:
+                        fields = gff_line.strip().split("\t")
+                        # Replace the tool
+                        fields[1] = f"dbCAN:{dbcan_version}"
+                        # Replace the feature
+                        fields[2] = "CAZyme"
+                        # Replace the confidence value
+                        fields[5] = "."
+                        # Keep only the ID in the 9th column
+                        attributes = fields[8].split(";")[0]
+                        # Add dbcan information to the 9th column
+                        attributes = f"{attributes};{col9_text};"
+                        fields[8] = attributes
+                        file_out.write("\t".join(fields) + "\n")
+
+
+def load_substrates(hmm_path):
+    substrates = dict()
+    with fileinput.hook_compressed(hmm_path, "rt") as file_in:
+        header = next(file_in)
+        header_fields = header.strip().split("\t")
+        substrate_idx = header_fields.index("Substrate")
+        gene_idx = header_fields.index("Gene ID")
+        evalue_idx = header_fields.index("E Value")
+        for line in file_in:
+            fields = line.strip().split("\t")
+            if float(fields[evalue_idx]) < 1e-15:  # evalue is the default from dbcan
+                substrate = fields[substrate_idx]
+                if not substrate == "-":
+                    gene_id = fields[gene_idx]
+                    substrates.setdefault(gene_id, []).append(substrate)
+    # resolve cases with multiple substrates
+    for gene_id, substrate_list in substrates.items():
+        substrate_list = list(set(substrate_list))
+        if len(substrate_list) == 1:
+            substrates[gene_id] = substrate_list[0]
+        else:
+            substrates[gene_id] = ",".join(substrate_list)
+    return substrates
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=(
+            "The script takes dbCAN output for a eukaryotic genome and parses it to create a standalone GFF."
+        )
+    )
+    parser.add_argument(
+        "-hmm",
+        dest="hmm_file",
+        required=True,
+        help="Path to the hmm file.",
+    )
+    parser.add_argument(
+        "-ov",
+        dest="overview_file",
+        required=True,
+        help="Path to the overview file.",
+    )
+    parser.add_argument(
+        "-g",
+        dest="genome_gff",
+        required=True,
+        help="Path to the genome GFF.",
+    )
+    parser.add_argument(
+        "-o",
+        dest="outfile",
+        required=True,
+        help="Path to the output file.",
+    )
+    parser.add_argument(
+        "-v",
+        dest="dbcan_ver",
+        required=True,
+        help="dbCAN version used.",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(
+        args.hmm_file, args.overview_file, args.genome_gff, args.outfile, args.dbcan_ver
+    )
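Within print_gff above, dbCAN's overview column reports EC numbers such as 2.4.99.-:5, where :5 counts the subfamily proteins carrying that EC; the loop strips the counts and drops "-" placeholders. The same transformation condensed into a join over a generator, equivalent in effect to the += loop in the new script:

    # dbCAN lists ECs as "EC:count" pairs joined by "|";
    # "2.4.99.-:5" means 5 proteins in the subfamily carry EC 2.4.99.-.
    ec_number_raw = "2.4.99.-:5|-|3.2.1.4:2"
    ec_number = "|".join(
        ec.split(":")[0] for ec in ec_number_raw.split("|") if ec != "-"
    )
    print(ec_number)  # 2.4.99.-|3.2.1.4
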
mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py
ADDED
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+
+# Copyright 2023-2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import fileinput
+import logging
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO)
+
+
+def main(standard_file, substrate_file, outfile, dbcan_version):
+    standard_path = Path(standard_file)
+    substrate_path = Path(substrate_file)
+
+    if not standard_path.exists():
+        raise FileNotFoundError(f"Input standards path does not exist: {standard_file}")
+
+    if not substrate_path.exists():
+        raise FileNotFoundError(
+            f"Input substrate path does not exist: {substrate_file}"
+        )
+
+    substrates = load_substrates(substrate_path)
+    cgc_locations = load_cgcs(standard_path)
+    print_gff(standard_path, outfile, dbcan_version, substrates, cgc_locations)
+
+
+def load_cgcs(standard_path):
+    cgc_locations = dict()
+    with fileinput.hook_compressed(standard_path, "rt") as file_in:
+        for line in file_in:
+            if not line.startswith("CGC#"):
+                cgc, _, contig, _, start, end, _, _ = line.strip().split("\t")
+                cgc_id = f"{contig}_{cgc}"
+                if cgc_id in cgc_locations:
+                    if cgc_locations[cgc_id]["start"] > int(start):
+                        cgc_locations[cgc_id]["start"] = int(start)
+                    if cgc_locations[cgc_id]["end"] < int(end):
+                        cgc_locations[cgc_id]["end"] = int(end)
+                else:
+                    cgc_locations[cgc_id] = {
+                        "start": int(start),
+                        "end": int(end),
+                        "contig": contig,
+                    }
+    return cgc_locations
+
+
+def print_gff(standard_path, outfile, dbcan_version, substrates, cgc_locations):
+    with open(outfile, "w") as file_out:
+        file_out.write("##gff-version 3\n")
+        cgcs_printed = list()
+        with fileinput.hook_compressed(standard_path, "rt") as file_in:
+            for line in file_in:
+                if not line.startswith("CGC#"):
+                    cgc, gene_type, contig, prot_id, start, end, strand, protein_fam = (
+                        line.strip().split("\t")
+                    )
+                    cgc_id = f"{contig}_{cgc}"
+                    protein_fam = protein_fam.replace(" ", "")
+                    if cgc_id not in cgcs_printed:
+                        substrate = (
+                            substrates[cgc_id]
+                            if cgc_id in substrates
+                            else "substrate_dbcan-pul=N/A;substrate_dbcan-sub=N/A"
+                        )
+                        file_out.write(
+                            "{}\tdbCAN:{}\tpredicted PUL\t{}\t{}\t.\t.\t.\tID={};{}\n".format(
+                                contig,
+                                dbcan_version,
+                                cgc_locations[cgc_id]["start"],
+                                cgc_locations[cgc_id]["end"],
+                                cgc_id,
+                                substrate,
+                            )
+                        )
+                        cgcs_printed.append(cgc_id)
+                    file_out.write(
+                        (
+                            f"{contig}\tdbCAN:{dbcan_version}\t{gene_type}\t{start}"
+                            + f"\t{end}\t.\t{strand}\t.\tID={prot_id};Parent={cgc_id};protein_family={protein_fam}\n"
+                        )
+                    )
+
+
+def load_substrates(substrate_path):
+    substrates = dict()
+    with fileinput.hook_compressed(substrate_path, "rt") as file_in:
+        for line in file_in:
+            if not line.startswith("#"):
+                parts = line.strip().split("\t")
+                cgc_parts = parts[0].rsplit("|", 1)
+                cgc = "_".join(cgc_parts)
+                try:
+                    substrate_pul = parts[2]
+                except IndexError:
+                    substrate_pul = "N/A"
+                try:
+                    substrate_ecami = parts[5]
+                except IndexError:
+                    substrate_ecami = "N/A"
+                if not substrate_pul:
+                    substrate_pul = "N/A"
+                if not substrate_ecami:
+                    substrate_ecami = "N/A"
+                substrates[cgc] = (
+                    f"substrate_dbcan-pul={substrate_pul};substrate_dbcan-sub={substrate_ecami}"
+                )
+
+    return substrates
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=(
+            "The script takes dbCAN output and parses it to create a standalone GFF."
+        )
+    )
+    parser.add_argument(
+        "-st",
+        dest="standard_file",
+        required=True,
+        help="Path to the standard file (*cgc_standard.out)",
+    )
+    parser.add_argument(
+        "-sb",
+        dest="substrate_file",
+        required=True,
+        help="Path to the substrate file (*substrate.out)",
+    )
+    parser.add_argument(
+        "-o",
+        dest="outfile",
+        required=True,
+        help="Path to the output file.",
+    )
+    parser.add_argument(
+        "-v",
+        dest="dbcan_ver",
+        required=True,
+        help="dbCAN version used.",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args.standard_file, args.substrate_file, args.outfile, args.dbcan_ver)
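load_cgcs above grows each cluster's coordinates to cover all of its member genes by keeping a running minimum start and maximum end per CGC id. The same bookkeeping in miniature, with made-up rows:

    # Rows are (cgc_id, gene start, gene end); the cluster interval must
    # span every member gene, as load_cgcs does with its min/max updates.
    rows = [
        ("contig1_CGC1", 120, 480),
        ("contig1_CGC1", 40, 300),
        ("contig1_CGC1", 350, 900),
    ]

    cgc_locations = {}
    for cgc_id, start, end in rows:
        loc = cgc_locations.setdefault(cgc_id, {"start": start, "end": end})
        loc["start"] = min(loc["start"], start)
        loc["end"] = max(loc["end"], end)

    print(cgc_locations)  # {'contig1_CGC1': {'start': 40, 'end': 900}}
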
mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py
ADDED
@@ -0,0 +1,230 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import fileinput
+import logging
+import pandas as pd
+
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s"
+)
+
+ANTISMASH_VERSION = "7.1.x"
+
+f"""
+Script parses antismash GFF output and adds descriptions from pre-parsed glossary https://docs.antismash.secondarymetabolites.org/glossary/.
+Glossary was taken from version {ANTISMASH_VERSION} and commit dbeeb0e https://github.com/antismash/documentation/blob/master/docs/glossary.md
+"""
+
+DESCRIPTIONS = {
+    "2dos": "2-deoxy-streptamine aminoglycoside",
+    "acyl_amino_acids": "N-acyl amino acid",
+    "amglyccycl": "Aminoglycoside/aminocyclitol",
+    "aminocoumarin": "Aminocoumarin",
+    "aminopolycarboxylic-acid": "Aminopolycarboxylic acid metallophores (doi:10.1039/C8MT00009C)",
+    "archaeal-ripp": "Archaeal RiPPs (doi:10.1021/jacs.2c00521 supplemental)",
+    "arylpolyene": "Aryl polyene",
+    "atropopeptide": "Atropopeptide RiPPs, e.g. scabrirubin and tryptorubin",
+    "azoxy-crosslink": "axoxy compounds formed by carboxilic cross-link",
+    "azoxy-dimer": "axoxy compounds formed by dimerisation",
+    "benzoxazole": "Benzoxazoles",
+    "betalactone": "Beta-lactone containing protease inhibitor",
+    "blactam": "β-lactam",
+    "bottromycin": "Bottromycin",
+    "butyrolactone": "Butyrolactone",
+    "cdps": "tRNA-dependent cyclodipeptide synthases",
+    "crocagin": "Crocagin-like",
+    "cyanobactin": "Cyanobactins like patellamide (AY986476)",
+    "cyclic-lactone-autoinducer": "agrD-like cyclic lactone autoinducer peptides (AF001782)",
+    "cytokinin": "Adenine-type cytokinins, e.g. fusatin and trans-zeatin",
+    "darobactin": "Darobactin-like compounds",
+    "deazapurine": "Deazapurine",
+    "ectoine": "Ectoine",
+    "epipeptide": "D-amino-acid containing RiPPs such as yydF (D78193)",
+    "fungal_cdps": "Fungal cyclodipeptide synthases",
+    "fungal-ripp": "Fungal RiPP with POP or UstH peptidase types and a modification",
+    "furan": "Furan",
+    "glycocin": "Glycocin",
+    "guanidinotides": "Pheganomycin-style protein ligase-containing cluster",
+    "hgle-ks": "Heterocyst glycolipid synthase-like PKS",
+    "hr-t2pks": "Highly reducing type II PKS like ishigamide and skyllamycin",
+    "hserlactone": "Homoserine lactone",
+    "hydrogen-cyanide": "Hydrogen cyanide (AF208523, doi:10.1128/jb.182.24.6940-6949.20)",
+    "hydroxy-tropolone": "7-hydroxytropolone-like cluster",
+    "indole": "Indole",
+    "isocyanide": "Isocyanides (doi:10.1093/nar/gkad573)",
+    "nrp with isocyanide": "Isocyanides (doi:0.1128/mBio.00785-18)",
+    "ladderane": "Ladderane",
+    "lanthipeptide class i": "Class I lanthipeptides like nisin",
+    "lanthipeptide class ii": "Class II lanthipeptides like mutacin II (U40620)",
+    "lanthipeptide class iii": "Class III lanthipeptides like labyrinthopeptin (FN178622)",
+    "lanthipeptide class iv": "Class IV lanthipeptides like venezuelin (HQ328852)",
+    "lanthipeptide class v": "Glycosylated lanthipeptide/linaridin hybrids like MT210103",
+    "lassopeptide": "Lasso peptide",
+    "leupeptin": "leupeptin-like compounds",
+    "linaridin": "Linear arid peptide such as cypemycin (HQ148718) and salinipeptin (MG788286)",
+    "lincosamides": "NRPS-adjacent biosynthesis of lincosamides",
+    "lipolanthine": "Lanthipeptide class containing N-terminal fatty acids such as MG673929",
+    "melanin": "Melanin",
+    "methanobactin": "Copper-chelating/transporting peptides (doi:10.1126/science.aap9437)",
+    "microviridin": "Microviridin",
+    "mycosporine": "Molecules containing mycosporine-like amino acid",
+    "naggn": "N-acetylglutaminylglutamine amide",
+    "napaa": "Non-alpha poly-amino acids like e-Polylysin",
+    "ni-siderophore": "NRPS-independent, IucA/IucC-like siderophores (*siderophore* prior to 7.0)",
+    "nitropropanoic-acid": "3-Nitropropanoic acid (neurotoxin)",
+    "nrps": "Non-ribosomal peptide synthetase",
+    "nrp-metallophore": "Non-ribosomal peptide metallophores",
+    "nucleoside": "Nucleoside",
+    "oligosaccharide": "Oligosaccharide",
+    "opine-like-metallophore": "Opine-like zincophores like staphylopine (doi:10.1128/mSystems.00554-20)",
+    "other": "Cluster containing a secondary metabolite-related protein that does not fit into any other category",
+    "pbde": "Polybrominated diphenyl ether",
+    "phenazine": "Phenazine",
+    "phosphoglycolipid": "Phosphoglycolipid",
+    "phosphonate": "Phosphonate",
+    "polyhalogenated-pyrrole": "Polyhalogenated pyrrole",
+    "polyyne": "Polyyne",
+    "ppys-ks": "PPY-like pyrone",
+    "prodigiosin": "Serratia-type non-traditional PKS prodigiosin biosynthesis pathway",
+    "proteusin": "Proteusin",
+    "pufa": "Polyunsaturated fatty acid",
+    "pyrrolidine": "Pyrrolidines like described in BGC0001510",
+    "ranthipeptide": "Cys-rich peptides (aka. SCIFF: six Cys in fourty-five) like in CP001581:3481278-3502939",
+    "ras-ripp": "Streptide-like thioether-bond RiPPs",
+    "rcdps": "Fungal Arginine-containing cyclic dipeptides",
+    "redox-cofactor": "Redox-cofactors such as PQQ (NC_021985:1458906-1494876)",
+    "resorcinol": "Resorcinol",
+    "sactipeptide": "Sactipeptide",
+    "spliceotide": "RiPPs containing plpX type spliceases (NZ_KB235920:17899-42115)",
+    "t1pks": "Type I PKS (Polyketide synthase)",
+    "t2pks": "Type II PKS",
+    "t3pks": "Type III PKS",
+    "terpene": "Terpene",
+    "thioamitides": "Thioamitide RiPPs as found in JOBF01000011",
+    "thioamide-nrp": "Thioamide-containing non-ribosomal peptide",
+    "transat-pks": "Trans-AT PKS",
+    "triceptide": "Triceptides",
+    "tropodithietic-acid": "Tropodithietic acid",
+    "fungal-ripp-like": "Fungal RiPP-likes",
+    "nrps-like": "NRPS-like fragment",
+    "phosphonate-like": "Phosphonate-like (prior to 7.0 this was the phosphonate rule)",
+    "pks-like": "Other types of PKS",
+    "ripp-like": "Other unspecified ribosomally synthesised and post-translationally modified peptide product (RiPP)",
+    "rre-containing": "RRE-element containing cluster",
+    "terpene-precursor": "Compound likely used as a terpene precursor",
+    "transat-pks-like": "Trans-AT PKS fragment, with trans-AT domain not found",
+    "fatty_acid": "Fatty acid (loose strictness, likely from primary metabolism)",
+    "halogenated": "Halogenase-containing cluster, potentially generating a halogenated product",
+    "lysine": "Fungal lysine primary metabolism",
+    "saccharide": "Saccharide (loose strictness, likely from primary metabolism)",
+    "lap": "Linear azol(in)e-containing peptides",
+    "mycosporine-like": "Molecules containing mycosporine-like amino acid",
+    "thiopeptide": "Thiopeptide",
+    "siderophore": "Siderophore",
+    "bacteriocin": "Bacteriocin or other unspecified ribosomally synthesised and post-translationally modified peptide product (RiPP)",
+    "fused": "Pheganomycin-style protein ligase-containing cluster",
+    "head_to_tail": "Head-to-tail cyclised RiPP (subtilosin-like)",
+    "lanthidin": "Glycosylated lanthipeptide/linaridin hybrids like MT210103",
+    "lanthipeptide": "Lanthipeptides",
+    "tfua-related": "TfuA-related RiPPs",
+    "otherks": "Other types of PKS",
+    "microcin": "Microcin",
+    "cf_saccharide": "Possible saccharide",
+    "cf_fatty_acid": "Possible fatty acid",
+    "cf_putative": "Putative cluster of unknown type identified with the ClusterFinder algorithm",
+}
+
+
+def parse_args():
+    description = (
+        "antiSMASH output summary generator. "
+        "Script takes regions from GFF and counts its appearance in annotation. "
+        "Output columns contain classID, descriptions and count. "
+        f"Descriptions were taken from pre-parsed glossary provided on antiSMASH website. "
+        f"Current script supports antiSMASH results for version {ANTISMASH_VERSION} and older."
+    )
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument("-i", "--antismash-gff", help="antiSMASH GFF", required=True)
+    parser.add_argument(
+        "-o", "--output", help="Antisamsh summary TSV output file.", required=True
+    )
+    parser.add_argument(
+        "-a",
+        "--antismash-version",
+        help="antiSMASH version that was used to generate GFF",
+        required=False,
+        default=ANTISMASH_VERSION,
+    )
+    args = parser.parse_args()
+    if args.antismash_version > ANTISMASH_VERSION:
+        logging.error(
+            "Provided version of antiSMASH is bigger than supported. "
+            "Please, make sure you have updated descriptions dictionary. Exit."
+        )
+        exit(1)
+    return args.antismash_gff, args.output
+
+
+def main():
+    input_gff, output_filename = parse_args()
+    dict_list = []
+    with fileinput.hook_compressed(input_gff, "r") as file_in:
+        # TODO: to be merged with the GFF toolkit
+        for line in file_in:
+            if line.startswith("#"):
+                continue
+            info = line.strip().split("\t")[8].split(";")
+            entry_dict = {}
+            for pair in info:
+                key, value = pair.split(
+                    "=", 1
+                )  # Ensure split only occurs at the first '=' occurrence
+                entry_dict[key] = value
+            dict_list.append(entry_dict)
+
+    # Convert to DataFrame
+    df = pd.DataFrame(dict_list)
+    df = df[df["product"].notna()]
+    df_grouped = (
+        df.groupby(["product"]).size().reset_index(name="Count")
+    ).sort_values(by="Count", ascending=False)
+
+    df_grouped = df_grouped.rename(
+        columns={
+            "product": "label",
+        }
+    )
+    df_grouped["Description"] = df_grouped["label"].apply(
+        lambda x: ",".join(
+            [
+                DESCRIPTIONS.get(cls.strip().lower(), cls.strip())
+                for cls in x.split(",")
+            ]
+        )
+    )
+    df_grouped = df_grouped[["label", "Description", "Count"]]
+    df_grouped = df_grouped.rename(columns={
+        "Description": "description",
+        "Count": "count"
+    })
+    df_grouped.to_csv(output_filename, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    main()
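The core of the new summariser is a pandas groupby over the GFF product attribute followed by a description lookup. A self-contained miniature of that shape, using tiny inline data instead of a parsed GFF:

    import pandas as pd

    # Tiny stand-in for the "product" values pulled out of the GFF.
    df = pd.DataFrame({"product": ["terpene", "nrps", "terpene", "t1pks"]})
    descriptions = {
        "terpene": "Terpene",
        "nrps": "Non-ribosomal peptide synthetase",
        "t1pks": "Type I PKS (Polyketide synthase)",
    }

    summary = df.groupby("product").size().reset_index(name="count")
    summary = summary.sort_values("count", ascending=False)
    summary["description"] = summary["product"].map(descriptions)
    print(summary.to_string(index=False))
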
mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py
ADDED
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import fileinput
+import logging
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s"
+)
+
+SANNTIS_VERSION = "0.9.4.1"
+
+f"""
+Script parses SanntiS GFF output and adds descriptions of annotated MIBiGs classes.
+Descriptions were pre-parsed for version {SANNTIS_VERSION} and stored as a dictionary.
+"""
+
+DESCRIPTIONS = {
+    "Polyketide": "Built from iterative condensation of acetate units derived from acetyl-CoA",
+    "Terpene": "Composed of isoprene (C5) units derived from isopentenyl pyrophosphate",
+    "Alkaloid": "Nitrogen-containing compounds derived from amino acids (e.g., ornithine, lysine, tyrosine, tryptophan)",
+    "RiPP": "Ribosomally synthesised and Post-translationally modified Peptide",
+    "NRP": "Nonribosomal Peptide",
+    "Saccharide": "Carbohydrate-based natural products (e.g., aminoglycoside antibiotics)",
+    "Other": "Catch-all class for clusters encoding metabolites outside main classes (e.g., cyclitols, indolocarbazoles, and phosphonates)",
+}
+
+
+def parse_args():
+    description = (
+        "Sanntis output summary generator. "
+        "Script takes SanntiS GFF and counts pairs of (nearest_MiBIG, nearest_MiBIG_class)."
+        "It also adds pre-parsed descriptions of classes stored in that script as a dictionary. "
+        f"Descriptions were taken from SanntiS docs v{SANNTIS_VERSION}."
+    )
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument("-i", "--sanntis-gff", help="SanntiS GFF", required=True)
+    parser.add_argument(
+        "-o", "--output", help="SanntiS summary TSV output file.", required=True
+    )
+    args = parser.parse_args()
+    return args.sanntis_gff, args.output
+
+
+def main():
+    input_gff, output_filename = parse_args()
+    dict_list = []
+    with fileinput.hook_compressed(input_gff, "r") as file_in:
+        # TODO: to be merged with the GFF toolkit
+        for line in file_in:
+            if line.startswith("#"):
+                continue
+            info = line.strip().split("\t")[8].split(";")
+            entry_dict = {}
+            # TODO: merge this with the GFF toolkit GFF reader
+            for pair in info:
+                key, value = pair.split(
+                    "=", 1
+                )  # Ensure split only occurs at the first '=' occurrence
+                entry_dict[key] = value
+            dict_list.append(entry_dict)
+
+    # Convert to DataFrame
+    df = pd.DataFrame(dict_list)
+    df = df.rename(
+        columns={
+            "nearest_MiBIG": "nearest_MIBiG",
+            "nearest_MiBIG_class": "nearest_MIBiG_class",
+        }
+    )
+    df_grouped = (
+        df.groupby(["nearest_MIBiG", "nearest_MIBiG_class"])
+        .size()
+        .reset_index(name="Count")
+    )
+    df_grouped = df_grouped.sort_values(by="Count", ascending=False)
+
+    df_desc = pd.DataFrame(
+        list(DESCRIPTIONS.items()), columns=["MIBiG_class", "Description"]
+    )
+    df_desc = df_desc.set_index("MIBiG_class")
+    df_merged = df_grouped.merge(
+        df_desc, left_on="nearest_MIBiG_class", right_index=True, how="left"
+    )
+    df_merged["Description"] = df_merged.apply(
+        lambda row: row["nearest_MIBiG_class"].replace(
+            "NRP", df_desc.loc["NRP"]["Description"]
+        )
+        if pd.isna(row["Description"]) and "NRP" in row["nearest_MIBiG_class"]
+        else row["Description"],
+        axis=1,
+    )
+    df_merged = df_merged[
+        ["nearest_MIBiG", "nearest_MIBiG_class", "Description", "Count"]
+    ]
+    df_merged = df_merged.rename(columns={
+        "Description": "description",
+        "Count": "count"
+    })
+    df_merged.to_csv(output_filename, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    main()
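summarise_sanntis_bgcs.py joins class descriptions onto the grouped counts with a left merge against a description table indexed by class; hybrid classes such as "NRP,Polyketide" miss the index and are patched afterwards by the apply() fallback that substitutes the NRP description. The merge step in miniature (made-up counts):

    import pandas as pd

    counts = pd.DataFrame(
        {"nearest_MIBiG_class": ["RiPP", "NRP,Polyketide"], "Count": [3, 1]}
    )
    desc = pd.DataFrame(
        {"MIBiG_class": ["RiPP", "NRP"],
         "Description": ["Ribosomally synthesised ...", "Nonribosomal Peptide"]}
    ).set_index("MIBiG_class")

    # Left merge keeps every counted class; unmatched hybrids get NaN, which
    # the script then fills via its NRP-substitution fallback.
    merged = counts.merge(
        desc, left_on="nearest_MIBiG_class", right_index=True, how="left"
    )
    print(merged)
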
mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py
CHANGED
@@ -62,9 +62,12 @@ class TableModifier:
         self.output_file = output_file
 
     def modify_table(self):
-        with
-
-
+        with (
+            fileinput.hook_compressed(
+                self.input_file, "r", encoding="utf-8"
+            ) as file_in,
+            open(self.output_file, "w") as file_out,
+        ):
             header_written = False
             separator_line, header = "", ""
             for line in file_in:
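The rewritten modify_table uses a parenthesised multi-item with statement, which is only valid syntax from Python 3.10 onward; older interpreters need backslash continuations or contextlib.ExitStack. A free-standing sketch of the construct:

    from pathlib import Path

    Path("in.txt").write_text("hello\n", encoding="utf-8")

    # Parenthesised multi-item "with" (Python 3.10+); on older interpreters
    # the same pairing needs a backslash continuation or contextlib.ExitStack.
    with (
        open("in.txt", encoding="utf-8") as file_in,
        open("out.txt", "w", encoding="utf-8") as file_out,
    ):
        for line in file_in:
            file_out.write(line.upper())
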
mgnify_pipelines_toolkit/constants/thresholds.py
CHANGED
@@ -26,9 +26,9 @@ MAX_INTERNAL_PRIMER_PROPORTION = 0.2
 # used by library_strategy_checker in analysis.shared
 MIN_AMPLICON_STRATEGY_CHECK = 0.30
 
+
 # used by markergene_study_summary in analysis.shared
 MAJORITY_MARKER_PROPORTION = 0.45
-
 # used by gff_toolkit in analysis.assembly
 EVALUE_CUTOFF_IPS = 1e-10
 EVALUE_CUTOFF_EGGNOG = 1e-10
{mgnify_pipelines_toolkit-1.0.4.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/RECORD
RENAMED
@@ -16,15 +16,19 @@ mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=N
 mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=wXrw1B-z4hOu5oA27Vp1WYxGP2Mk6ZY4i_T5jDZgek0,6954
 mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py,sha256=Pq-9RSt3RCxzDMQVW1VHlHF4NtpVwCWFbg2CMkvpZZc,19089
 mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py,sha256=2T4T7aXMGPac-LZUXJF3lOUzZZF50dAKkKTSaO-4idQ,3587
-mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py,sha256=
-mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py,sha256=
-mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py,sha256=
+mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py,sha256=6gbCRlEX1eBqzFYjOt3og-961dZ--QsCJL-7l5nzg1k,33992
+mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py,sha256=_4J31wAjK5B15p9lDzTG2wmZdyoZkOgmy7Kp_w8lTeE,3812
+mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py,sha256=n2rbS8UHevF8OB4umo7QivS6v4avlgsCaRwrBbVmZ24,5398
 mgnify_pipelines_toolkit/analysis/assembly/go_utils.py,sha256=eay9e3Xdc8XxnlC_4SHHjN89k-M9i_cFMc2lI_ZFxqY,5596
 mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py,sha256=uex2T6GagtYFBIc39-Xm4SFHL06KAQ5v0_loOmY_eaw,4289
+mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py,sha256=dlsXfLPvm1qXRQplM0d5q0JMv9AuD9ytZn3utgYDYAM,7712
+mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py,sha256=61zVSuQ6Jw2hZU6QOa05lNcrr8Ylzpy2Tes-QaX8kFw,5888
+mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=eRAQ0vFbqnWreiBdtFuwLKve9WwYwv9dYQtD1pumaZs,10776
 mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=TPaKlYkoy37_XgYNOskWCCoXtPNku_k5ygSeK4fT1VQ,6689
+mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=65szj-H8Hxy_eXy3TyTs48EhPJbJ2w1skHlVbH2YeVM,4538
 mgnify_pipelines_toolkit/analysis/genomes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py,sha256=
+mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py,sha256=kAGU5kQyj-Hlcdx32i-xOJSuHYYUDj-kqnyYHMohHGc,4477
 mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py,sha256=hggPqv9QawWAccm5tmru4VF9VnQAHF5LCXnqyLw_BWI,6727
 mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py,sha256=ye0Jka6_lNn4dQGb2QG3YT46y7QK0QvyaIitIaS8JVQ,4026
 mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=UrU0CpZj3pfHZWI7Uuhv2a_C0JsO8pnVErY0sWGgNdw,4920
@@ -38,15 +42,15 @@ mgnify_pipelines_toolkit/constants/ncrna.py,sha256=a_5hWp446S7BhRbe_JcydFgZM7sgP
 mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=7nEOODQq35y9wx9YnvJuo29oBpwTpXg_kIbf_t7N4TQ,1093
 mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=G-xrc9b8zdmPTaOICD2b3RCVeFAEOVkfRkIfotQ7gek,1193
 mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=kMq__kOJcbiwsgolkdvb-XLo3WMnJdEXgedjUyMOYjI,1081
-mgnify_pipelines_toolkit/constants/thresholds.py,sha256=
+mgnify_pipelines_toolkit/constants/thresholds.py,sha256=V_xDBk0RhS3hHeWqOacKzth2gM6zJABRPgwHy-Ciqfk,1157
 mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=0bM4MwarFiM5yTcp5AbAmQ0o-q-gWy7kknir9zJ9R0A,1312
 mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pnH8LUH8i2ACNvFNWyG-n-eIHZcI5O9UDYulkh43mec,7692
 mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
 mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
-mgnify_pipelines_toolkit-1.0.
-mgnify_pipelines_toolkit-1.0.
-mgnify_pipelines_toolkit-1.0.
-mgnify_pipelines_toolkit-1.0.
-mgnify_pipelines_toolkit-1.0.
-mgnify_pipelines_toolkit-1.0.
+mgnify_pipelines_toolkit-1.0.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mgnify_pipelines_toolkit-1.0.5.dist-info/METADATA,sha256=fp1mfamjDfteh42zghP_y8br_zRaEYGzAhbDB7O72mc,5810
+mgnify_pipelines_toolkit-1.0.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+mgnify_pipelines_toolkit-1.0.5.dist-info/entry_points.txt,sha256=T8soGT2to8c_qafw-0itqCn4sjOnxlfaNWHIaHz4H54,3416
+mgnify_pipelines_toolkit-1.0.5.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+mgnify_pipelines_toolkit-1.0.5.dist-info/RECORD,,
{mgnify_pipelines_toolkit-1.0.4.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/entry_points.txt
RENAMED
@@ -18,6 +18,7 @@ genomes_extract_trnas = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_tr
 get_mpt_version = mgnify_pipelines_toolkit.utils.get_mpt_version:main
 get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
 get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
+gff_toolkit = mgnify_pipelines_toolkit.analysis.assembly.gff_toolkit:main
 krona_txt_from_cat_classification = mgnify_pipelines_toolkit.analysis.assembly.krona_txt_from_cat_classification:main
 library_strategy_check = mgnify_pipelines_toolkit.analysis.shared.library_strategy_check:main
 make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
@@ -25,8 +26,12 @@ mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
 mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
 markergene_study_summary = mgnify_pipelines_toolkit.analysis.shared.markergene_study_summary:main
 primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
+process_dbcan_cazys = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_cazys:main
+process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main
 remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
 rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
 standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
 study_summary_generator = mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:cli
+summarise_antismash_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_antismash_bgcs:main
 summarise_goslims = mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main
+summarise_sanntis_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_sanntis_bgcs:main
{mgnify_pipelines_toolkit-1.0.4.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/WHEEL
RENAMED
File without changes
{mgnify_pipelines_toolkit-1.0.4.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/licenses/LICENSE
RENAMED
File without changes
{mgnify_pipelines_toolkit-1.0.4.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/top_level.txt
RENAMED
File without changes